diff --git a/CHANGELOG.md b/CHANGELOG.md index 8599977452207..875a67d92444c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Implement GRPC Search params `Highlight`and `Sort` ([#19868](https://github.com/opensearch-project/OpenSearch/pull/19868)) - Implement GRPC ConstantScoreQuery, FuzzyQuery, MatchBoolPrefixQuery, MatchPhrasePrefix, PrefixQuery, MatchQuery ([#19854](https://github.com/opensearch-project/OpenSearch/pull/19854)) - Add async periodic flush task support for pull-based ingestion ([#19878](https://github.com/opensearch-project/OpenSearch/pull/19878)) +- Multifold Improvement in Multi-Clause Boolean Query, Window Scoring Approach ([#19046](https://github.com/opensearch-project/OpenSearch/pull/19046)) + ### Changed - Faster `terms` query creation for `keyword` field with index and docValues enabled ([#19350](https://github.com/opensearch-project/OpenSearch/pull/19350)) diff --git a/server/src/main/java/org/opensearch/index/query/BoolQueryBuilder.java b/server/src/main/java/org/opensearch/index/query/BoolQueryBuilder.java index f2e7565c885c1..b440cf65762b6 100644 --- a/server/src/main/java/org/opensearch/index/query/BoolQueryBuilder.java +++ b/server/src/main/java/org/opensearch/index/query/BoolQueryBuilder.java @@ -45,6 +45,8 @@ import org.opensearch.core.xcontent.ObjectParser; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.search.approximate.ApproximateBooleanQuery; +import org.opensearch.search.approximate.ApproximateScoreQuery; import java.io.IOException; import java.util.ArrayList; @@ -335,7 +337,18 @@ protected Query doToQuery(QueryShardContext context) throws IOException { } Query query = Queries.applyMinimumShouldMatch(booleanQuery, minimumShouldMatch); - return adjustPureNegative ? 
fixNegativeQueryIfNeeded(query) : query;
+
+        if (adjustPureNegative) {
+            query = fixNegativeQueryIfNeeded(query);
+        }
+
+        // limit approximate query construction since several mappers (prefixQuery) expect a BooleanQuery not ApproximateBooleanQuery
+        if (query instanceof BooleanQuery boolQuery
+            && (boolQuery.getClauses(Occur.FILTER).size() == boolQuery.clauses().size() || boolQuery.clauses().size() == 1)) {
+            return new ApproximateScoreQuery(query, new ApproximateBooleanQuery(boolQuery));
+        }
+
+        return query;
     }
 
     private static void addBooleanClauses(
diff --git a/server/src/main/java/org/opensearch/search/approximate/ApproximateBooleanQuery.java b/server/src/main/java/org/opensearch/search/approximate/ApproximateBooleanQuery.java
new file mode 100644
index 0000000000000..0b61bfef8d430
--- /dev/null
+++ b/server/src/main/java/org/opensearch/search/approximate/ApproximateBooleanQuery.java
@@ -0,0 +1,243 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.search.approximate;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.ScorerSupplier;
+import org.apache.lucene.search.Weight;
+import org.opensearch.search.internal.SearchContext;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An approximate-able version of {@link BooleanQuery}. For single clause boolean queries,
+ * it unwraps the query into the singular clause and ensures approximation is applied.
+ * Multi-clause queries are only approximated when every clause is a {@code FILTER} clause.
+ */
+public class ApproximateBooleanQuery extends ApproximateQuery {
+    public final BooleanQuery boolQuery;
+    private final int size;
+    private final List<BooleanClause> clauses;
+    private boolean isTopLevel = true; // Default to true, set to false when nested in boolean query
+
+    public ApproximateBooleanQuery(BooleanQuery boolQuery) {
+        this(boolQuery, SearchContext.DEFAULT_TRACK_TOTAL_HITS_UP_TO);
+    }
+
+    protected ApproximateBooleanQuery(BooleanQuery boolQuery, int size) {
+        this.boolQuery = boolQuery;
+        this.size = size;
+        this.clauses = boolQuery.clauses();
+    }
+
+    public BooleanQuery getBooleanQuery() {
+        return boolQuery;
+    }
+
+    public boolean isTopLevel() {
+        return this.isTopLevel;
+    }
+
+    public void setTopLevel(boolean isTopLevel) {
+        this.isTopLevel = isTopLevel;
+    }
+
+    @Override
+    public Query rewrite(IndexSearcher indexSearcher) throws IOException {
+        return super.rewrite(indexSearcher);
+    }
+
+    /**
+     * Recursively unwraps boolean queries that hold exactly one clause and rewrites the innermost
+     * query. Multi-clause boolean queries are returned unchanged, and so is a lone
+     * {@code MUST_NOT} clause: replacing it by its inner query would match the very documents
+     * the clause excludes.
+     *
+     * @param query the query to unwrap
+     * @param indexSearcher searcher used to rewrite the unwrapped query
+     * @return the unwrapped and rewritten query, or {@code query} itself when nothing was unwrapped
+     * @throws UncheckedIOException if rewriting fails with an {@link IOException}
+     */
+    public static Query boolRewrite(Query query, IndexSearcher indexSearcher) {
+        BooleanQuery bool = null;
+        if (query instanceof BooleanQuery plainBool) {
+            bool = plainBool;
+        } else if (query instanceof ApproximateBooleanQuery appxBool) {
+            bool = appxBool.getBooleanQuery();
+        }
+        if (bool != null) {
+            List<BooleanClause> boolClauses = bool.clauses();
+            boolean unwrappable = boolClauses.size() == 1 && boolClauses.get(0).occur() != BooleanClause.Occur.MUST_NOT;
+            return unwrappable ? boolRewrite(boolClauses.get(0).query(), indexSearcher) : query;
+        }
+        try {
+            return query.rewrite(indexSearcher);
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * Decides whether this query may be approximated for the given search.
+     *
+     * @param context the search context, may be {@code null}
+     * @return {@code false} for a {@code null} context, accurate total hit tracking, aggregations,
+     *         highlighting or a nested (non top-level) query; otherwise the result of the clause checks
+     */
+    @Override
+    protected boolean canApproximate(SearchContext context) {
+        if (context == null) {
+            return false;
+        }
+
+        // Don't approximate if we need accurate total hits
+        if (context.trackTotalHitsUpTo() == SearchContext.TRACK_TOTAL_HITS_ACCURATE) {
+            return false;
+        }
+
+        // Don't approximate if we have aggregations
+        if (context.aggregations() != null) {
+            return false;
+        }
+
+        // Don't approximate if highlighting is enabled
+        if (context.highlight() != null) {
+            return false;
+        }
+
+        if (!isTopLevel) {
+            return false;
+        }
+
+        // For single clause boolean queries, check if the clause can be approximated
+        if (clauses.size() == 1 && clauses.get(0).occur() != BooleanClause.Occur.MUST_NOT) {
+            // Only a clause that is itself an ApproximateScoreQuery can be approximated
+            if (clauses.get(0).query() instanceof ApproximateScoreQuery approximateScoreQuery) {
+                return approximateScoreQuery.getApproximationQuery().canApproximate(context);
+            }
+            return false;
+        }
+
+        boolean hasApproximate = false;
+
+        // multi clause case - we might want to consider strategies for nested cases, for now limit to just top level
+        for (BooleanClause clause : clauses) {
+            if (clause.occur() != BooleanClause.Occur.FILTER) {
+                return false;
+            }
+            Query clauseQuery = clause.query();
+            // Nested boolean queries, wrapped or not, are not approximated. The plain BooleanQuery case
+            // is tested on its own because such a clause is not an ApproximateScoreQuery.
+            if (clauseQuery instanceof BooleanQuery) {
+                return false;
+            }
+            if (clauseQuery instanceof ApproximateScoreQuery appxScore) {
+                if (appxScore.getApproximationQuery() instanceof ApproximateBooleanQuery) {
+                    return false;
+                }
+                if (appxScore.getApproximationQuery() instanceof ApproximatePointRangeQuery) {
+                    hasApproximate = true;
+                }
+            }
+        }
+
+        return hasApproximate;
+    }
+
+    /**
+     * Creates the weight. A single clause (other than {@code MUST_NOT}) delegates to that clause wrapped
+     * in a {@link ConstantScoreQuery}; anything else gets an {@link ApproximateBooleanWeight}.
+     * NOTE(review): the single clause path casts whatever ConstantScoreQuery#createWeight returns to
+     * ConstantScoreWeight - confirm that holds for every ScoreMode.
+     */
+    @Override
+    public ConstantScoreWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
+        // For single clause boolean queries, delegate to the clause's createWeight
+        if (clauses.size() == 1 && clauses.get(0).occur() != BooleanClause.Occur.MUST_NOT) {
+            Query clauseQuery = clauses.get(0).query();
+
+            // If it's a scoring query, wrap it in a ConstantScoreQuery to ensure constant scoring
+            if (!(clauseQuery instanceof ConstantScoreQuery)) {
+                clauseQuery = new ConstantScoreQuery(clauseQuery);
+            }
+
+            return (ConstantScoreWeight) clauseQuery.createWeight(searcher, scoreMode, boost);
+        }
+
+        // For multi-clause boolean queries, create a custom weight
+        return new ApproximateBooleanWeight(searcher, scoreMode, boost);
+    }
+
+    /**
+     * Custom Weight implementation for ApproximateBooleanQuery that handles multi-clause boolean queries.
+     * The clause weights are created once here and shared by the scorer suppliers of all leaves.
+     */
+    private class ApproximateBooleanWeight extends ConstantScoreWeight {
+        private final ScoreMode scoreMode;
+        private final float boost;
+        private final List<Weight> clauseWeights;
+
+        public ApproximateBooleanWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
+            super(ApproximateBooleanQuery.this, boost);
+            this.scoreMode = scoreMode;
+            this.boost = boost;
+            // Weights do not depend on the leaf, so build them once instead of once per segment
+            List<Weight> weights = new ArrayList<>(clauses.size());
+            for (BooleanClause clause : clauses) {
+                weights.add(clause.query().createWeight(searcher, scoreMode, boost));
+            }
+            this.clauseWeights = weights;
+        }
+
+        @Override
+        public boolean isCacheable(LeafReaderContext ctx) {
+            return false;
+        }
+
+        @Override
+        public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+            return new ApproximateBooleanScorerSupplier(clauseWeights, scoreMode, boost, size, context);
+        }
+
+    }
+
+    @Override
+    public String toString(String s) {
+        return "ApproximateBooleanQuery(" + boolQuery.toString(s) + ")";
+    }
+
+    @Override
+    public void visit(QueryVisitor queryVisitor) {
+        boolQuery.visit(queryVisitor);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        ApproximateBooleanQuery that = (ApproximateBooleanQuery) o;
+        return size == that.size && boolQuery.equals(that.boolQuery);
+    }
+
+    @Override
+    public int hashCode() {
+        return boolQuery.hashCode();
+    }
+}
diff --git a/server/src/main/java/org/opensearch/search/approximate/ApproximateBooleanScorerSupplier.java b/server/src/main/java/org/opensearch/search/approximate/ApproximateBooleanScorerSupplier.java
new file mode 100644
index 0000000000000..31cb9a400f4aa
--- /dev/null
+++ b/server/src/main/java/org/opensearch/search/approximate/ApproximateBooleanScorerSupplier.java
@@ -0,0 +1,323 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.search.approximate;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.BulkScorer;
+import org.apache.lucene.search.ConjunctionUtils;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.ScorerSupplier;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.Bits;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+
+/**
+ * A ScorerSupplier implementation for ApproximateBooleanQuery that creates a fresh DocIdSetIterator
+ * for each clause per window and uses Lucene's ConjunctionUtils to intersect them.
+ * <p>
+ * Only {@link #bulkScorer()} does real work; {@link #get(long)} returns {@code null}.
+ */
+public class ApproximateBooleanScorerSupplier extends ScorerSupplier {
+    private final List<Weight> clauseWeights;
+    private final List<ScorerSupplier> cachedSuppliers; // Cache suppliers to avoid repeated calls
+    private final ScoreMode scoreMode;
+    private final float boost;
+    private final int size;
+    // True when some clause has no scorer supplier for this leaf, which empties the whole conjunction
+    private final boolean hasEmptyClause;
+    private long cost = -1;
+    private int scalingFactor = 3;
+
+    /**
+     * Creates a new ApproximateBooleanScorerSupplier.
+     *
+     * @param clauseWeights The weights for each clause in the boolean query
+     * @param scoreMode The score mode
+     * @param boost The boost factor
+     * @param size The threshold for early termination
+     * @param context The leaf reader context
+     * @throws IOException If there's an error creating scorer suppliers
+     */
+    public ApproximateBooleanScorerSupplier(
+        List<Weight> clauseWeights,
+        ScoreMode scoreMode,
+        float boost,
+        int size,
+        LeafReaderContext context
+    ) throws IOException {
+        this.clauseWeights = new ArrayList<>(clauseWeights.size());
+        this.cachedSuppliers = new ArrayList<>(clauseWeights.size());
+        this.scoreMode = scoreMode;
+        this.boost = boost;
+        this.size = size;
+
+        // Store weights and cache their suppliers
+        boolean emptyClause = false;
+        for (Weight clauseWeight : clauseWeights) {
+            ScorerSupplier supplier = clauseWeight.scorerSupplier(context);
+            if (supplier == null) {
+                // The clauses are intersected, so a clause without matches on this leaf means the
+                // intersection is empty; dropping the clause would silently widen the result set.
+                emptyClause = true;
+                break;
+            }
+            this.clauseWeights.add(clauseWeight);
+            this.cachedSuppliers.add(supplier);
+        }
+        this.hasEmptyClause = emptyClause;
+    }
+
+    /**
+     * Sets the factor by which the window grows on every expansion.
+     *
+     * @param factor growth factor, must be at least 1
+     * @throws IllegalArgumentException if {@code factor} is smaller than 1
+     */
+    public void setScalingWindowFactor(int factor) {
+        if (factor < 1) {
+            throw new IllegalArgumentException("scaling window factor must be >= 1, got " + factor);
+        }
+        scalingFactor = factor;
+    }
+
+    /**
+     * Not implemented: this supplier is only meant to be driven through {@link #bulkScorer()}.
+     * NOTE(review): returning {@code null} breaks the {@link ScorerSupplier#get(long)} contract; any
+     * caller that reaches this path sees "no scorer" - confirm no such caller exists.
+     *
+     * @param leadCost Cost of the scorer that will be used in order to lead iteration.
+     * @return always {@code null}
+     */
+    @Override
+    public Scorer get(long leadCost) throws IOException {
+        // should not get called in a non-top level query
+        return null;
+    }
+
+    /**
+     * Get an estimate of the cost: the minimum cost of all clauses, or 0 when the conjunction is empty.
+     */
+    @Override
+    public long cost() {
+        if (cost == -1) {
+            if (hasEmptyClause || cachedSuppliers.isEmpty()) {
+                cost = 0;
+            } else {
+                // Estimate cost as the minimum cost of all clauses (conjunction)
+                cost = Long.MAX_VALUE;
+                for (ScorerSupplier supplier : cachedSuppliers) {
+                    cost = Math.min(cost, supplier.cost());
+                }
+            }
+        }
+        return cost;
+    }
+
+    /**
+     * Get a bulk scorer that intersects the clauses window by window. The window starts at
+     * {@code max(32768, min(minCost, maxCost / 128))} scaled by the scaling factor and keeps growing
+     * until {@code size} documents were collected or growing cannot produce further documents.
+     *
+     * @return the bulk scorer, or {@code null} when the conjunction cannot match on this leaf
+     */
+    @Override
+    public BulkScorer bulkScorer() throws IOException {
+        if (hasEmptyClause || clauseWeights.isEmpty()) {
+            return null;
+        }
+
+        // Calculate window size heuristic using cached suppliers
+        long minCost = Long.MAX_VALUE;
+        long maxCost = 0;
+        for (ScorerSupplier supplier : cachedSuppliers) {
+            long clauseCost = supplier.cost();
+            minCost = Math.min(minCost, clauseCost);
+            maxCost = Math.max(maxCost, clauseCost);
+        }
+        // Clamp while still a long so that a huge cost cannot wrap around when narrowed to int
+        final int initialWindowSize = (int) Math.min(
+            Integer.MAX_VALUE,
+            Math.max(1L << 15, Math.min(minCost, maxCost / (1 << 7)))
+        );
+
+        // Create a simple scorer for the collector (will be used by windowed approach)
+        Scorer scorer = new Scorer() {
+            @Override
+            public DocIdSetIterator iterator() {
+                // This won't be used in windowed approach
+                return DocIdSetIterator.empty();
+            }
+
+            @Override
+            public float score() throws IOException {
+                return 0.0f;
+            }
+
+            @Override
+            public float getMaxScore(int upTo) throws IOException {
+                return 0.0f;
+            }
+
+            @Override
+            public int docID() {
+                return -1;
+            }
+        };
+
+        return new BulkScorer() {
+            private int totalCollected = 0;
+            private final BitSet collectedDocs = new BitSet(); // Track collected documents
+
+            // Windowed approach state
+            private int currentWindowSize = initialWindowSize;
+            private DocIdSetIterator globalConjunction = null;
+            // Whether the latest rebuild reported every approximate clause as fully traversed
+            private boolean lastWindowCoveredEverything = false;
+
+            /**
+             * Builds one iterator per clause for the given window.
+             *
+             * @return the iterators, or {@code null} if a clause produced no scorer
+             */
+            private List<DocIdSetIterator> rebuildIteratorsWithWindowSize(int windowSize) throws IOException {
+                List<DocIdSetIterator> newIterators = new ArrayList<>(clauseWeights.size());
+                boolean allClausesFullyTraversed = true;
+
+                for (int i = 0; i < clauseWeights.size(); i++) {
+                    ScorerSupplier supplier = cachedSuppliers.get(i); // Use cached supplier
+                    Query query = clauseWeights.get(i).getQuery();
+                    Scorer clauseScorer;
+
+                    if (query instanceof ApproximatePointRangeQuery approxQuery) {
+                        // Temporarily set the size to the window, restore it afterwards
+                        int originalSize = approxQuery.getSize();
+                        approxQuery.setSize(windowSize);
+                        try {
+                            clauseScorer = supplier.get(windowSize);
+                            if (clauseScorer != null && !approxQuery.getFullyTraversed()) {
+                                allClausesFullyTraversed = false;
+                            }
+                        } finally {
+                            approxQuery.setSize(originalSize);
+                        }
+                    } else {
+                        // Regular queries use full cost - always fully traversed
+                        // NOTE(review): get() is invoked again on the same supplier for every window,
+                        // although ScorerSupplier#get is documented as call-at-most-once - confirm.
+                        clauseScorer = supplier.get(supplier.cost());
+                    }
+
+                    if (clauseScorer == null) {
+                        // No scorer for this clause, end entire conjunction
+                        return null;
+                    }
+                    newIterators.add(clauseScorer.iterator());
+                }
+
+                lastWindowCoveredEverything = allClausesFullyTraversed;
+                return newIterators;
+            }
+
+            /**
+             * Grows the window until the rebuilt conjunction has a first document.
+             *
+             * @return that document, or NO_MORE_DOCS when growing further cannot add documents
+             */
+            private int expandUntilMatch() throws IOException {
+                while (true) {
+                    long grown = Math.min(Integer.MAX_VALUE, (long) currentWindowSize * scalingFactor);
+                    if (globalConjunction != null && (lastWindowCoveredEverything || grown <= currentWindowSize)) {
+                        // Either the last window already held everything or the window cannot grow:
+                        // another pass would only revisit documents that are already collected.
+                        return DocIdSetIterator.NO_MORE_DOCS;
+                    }
+
+                    List<DocIdSetIterator> newIterators = rebuildIteratorsWithWindowSize((int) grown);
+                    if (newIterators == null) {
+                        return DocIdSetIterator.NO_MORE_DOCS;
+                    }
+                    currentWindowSize = (int) grown;
+                    // intersectIterators needs at least two iterators, a single one is used as is
+                    globalConjunction = newIterators.size() == 1
+                        ? newIterators.get(0)
+                        : ConjunctionUtils.intersectIterators(newIterators);
+
+                    int firstDoc = globalConjunction.nextDoc();
+                    if (firstDoc != DocIdSetIterator.NO_MORE_DOCS) {
+                        return firstDoc;
+                    }
+                }
+            }
+
+            @Override
+            public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
+                collector.setScorer(scorer);
+
+                // Check if we need to expand window
+                if (totalCollected < size && (globalConjunction == null || globalConjunction.docID() == DocIdSetIterator.NO_MORE_DOCS)) {
+                    // NOTE(review): the returned doc may be smaller than max; this relies on the caller
+                    // (the original comment names CancellableBulkScorer) using it as the next min.
+                    return expandUntilMatch();
+                }
+
+                // Score existing conjunction within [min, max) range
+                return scoreExistingConjunction(collector, acceptDocs, min, max);
+            }
+
+            private int scoreExistingConjunction(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
+                if (globalConjunction == null) {
+                    return DocIdSetIterator.NO_MORE_DOCS;
+                }
+
+                // Position the iterator correctly
+                if (globalConjunction.docID() < min) {
+                    if (globalConjunction.docID() == min - 1) {
+                        globalConjunction.nextDoc();
+                    } else {
+                        globalConjunction.advance(min);
+                    }
+                }
+
+                // Score documents in the range [min, max)
+                for (int doc = globalConjunction.docID(); doc < max; doc = globalConjunction.nextDoc()) {
+                    if (totalCollected >= size) {
+                        return DocIdSetIterator.NO_MORE_DOCS; // Early termination
+                    }
+
+                    // A wider window restarts from the beginning, so skip what was already collected
+                    if (!collectedDocs.get(doc) && (acceptDocs == null || acceptDocs.get(doc))) {
+                        collectedDocs.set(doc); // Mark as collected
+                        collector.collect(doc);
+                        totalCollected++;
+                    }
+                }
+
+                // Conjunction exhausted but more hits wanted: widen the window. IOExceptions propagate.
+                if (globalConjunction.docID() == DocIdSetIterator.NO_MORE_DOCS && totalCollected < size) {
+                    return expandUntilMatch();
+                }
+
+                return globalConjunction.docID();
+            }
+
+            @Override
+            public long cost() {
+                return ApproximateBooleanScorerSupplier.this.cost();
+            }
+        };
+    }
+}
diff --git a/server/src/main/java/org/opensearch/search/approximate/ApproximatePointRangeQuery.java
b/server/src/main/java/org/opensearch/search/approximate/ApproximatePointRangeQuery.java index 046eb4dc1c86f..2755c35a9d4b2 100644 --- a/server/src/main/java/org/opensearch/search/approximate/ApproximatePointRangeQuery.java +++ b/server/src/main/java/org/opensearch/search/approximate/ApproximatePointRangeQuery.java @@ -57,6 +57,9 @@ public class ApproximatePointRangeQuery extends ApproximateQuery { private SortOrder sortOrder; public PointRangeQuery pointRangeQuery; private final Function valueToString; + private boolean isTopLevel = true; // Default to true, set to false when nested in boolean query + + private boolean hasBeenFullyTraversed = false; public ApproximatePointRangeQuery( String field, @@ -96,6 +99,14 @@ public void setSize(int size) { this.size = size; } + public boolean isTopLevel() { + return this.isTopLevel; + } + + public void setTopLevel(boolean isTopLevel) { + this.isTopLevel = isTopLevel; + } + public SortOrder getSortOrder() { return this.sortOrder; } @@ -104,6 +115,10 @@ public void setSortOrder(SortOrder sortOrder) { this.sortOrder = sortOrder; } + public boolean getFullyTraversed() { + return hasBeenFullyTraversed; + } + @Override public Query rewrite(IndexSearcher indexSearcher) throws IOException { return super.rewrite(indexSearcher); @@ -173,7 +188,6 @@ public void grow(int count) { @Override public void visit(int docID) { - // it is possible that size < 1024 and docCount < size but we will continue to count through all the 1024 docs adder.add(docID); docCount[0]++; } @@ -242,7 +256,8 @@ private boolean checkValidPointValues(PointValues values) throws IOException { private void intersectLeft(PointValues.PointTree pointTree, PointValues.IntersectVisitor visitor, long[] docCount) throws IOException { - intersectLeft(visitor, pointTree, docCount); + intersectLeft(visitor, pointTree, docCount); // Top-level call + // Only assert for complete traversals (top-level calls) assert pointTree.moveToParent() == false; } @@ -346,6 +361,7 @@ public 
void intersectRight(PointValues.IntersectVisitor visitor, PointValues.Poi @Override public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + hasBeenFullyTraversed = false; LeafReader reader = context.reader(); long[] docCount = { 0 }; @@ -353,58 +369,133 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti if (checkValidPointValues(values) == false) { return null; } + // values.size(): total points indexed, In most cases: values.size() ≈ number of documents (assuming single-valued fields) - if (size > values.size()) { + if (size > values.size() && isTopLevel) { return pointRangeQueryWeight.scorerSupplier(context); } else { if (sortOrder == null || sortOrder.equals(SortOrder.ASC)) { return new ScorerSupplier() { - - final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values); - final PointValues.IntersectVisitor visitor = getIntersectVisitor(result, docCount); + // Keep a visitor for cost estimation only + DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values); + PointValues.IntersectVisitor visitor = getIntersectVisitor(result, docCount); long cost = -1; + long lastDoc = -1; @Override public Scorer get(long leadCost) throws IOException { - intersectLeft(values.getPointTree(), visitor, docCount); - DocIdSetIterator iterator = result.build().iterator(); - return new ConstantScoreScorer(score(), scoreMode, iterator); + if (!isTopLevel) { + // Use leadCost as dynamic size if it's reasonable, otherwise use original size + int dynamicSize = (leadCost > 0 && leadCost < Integer.MAX_VALUE) ? 
(int) leadCost : size; + return getWithSize(dynamicSize); + } else { + // For top-level queries, use standard approach + return getWithSize(size); + } + } + + public Scorer getWithSize(int dynamicSize) throws IOException { + // Temporarily update size for this call + int originalSize = size; + size = dynamicSize; + + try { + if (size > values.size()) { + hasBeenFullyTraversed = true; + } + // For windowed approach, create fresh iterator without ResumableDISI state + DocIdSetBuilder freshResult = new DocIdSetBuilder(reader.maxDoc(), values); + long[] freshDocCount = new long[1]; + PointValues.IntersectVisitor freshVisitor = getIntersectVisitor(freshResult, freshDocCount); + + // Always start fresh traversal from root + intersectLeft(values.getPointTree(), freshVisitor, freshDocCount); + + DocIdSetIterator iterator = freshResult.build().iterator(); + lastDoc = iterator.docIDRunEnd(); + return new ConstantScoreScorer(score(), scoreMode, iterator); + } finally { + // Restore original size + size = originalSize; + } } @Override public long cost() { if (cost == -1) { - // Computing the cost may be expensive, so only do it if necessary - cost = values.estimateDocCount(visitor); - assert cost >= 0; + if (isTopLevel) { + // Computing the cost may be expensive, so only do it if necessary + cost = values.estimateDocCount(visitor); + assert cost >= 0; + } else { + return lastDoc != -1 ? 
lastDoc : values.estimateDocCount(visitor); + } } return cost; } }; } else { + // Descending sort - use intersectRight // we need to fetch size + deleted docs since the collector will prune away deleted docs resulting in fewer results // than expected final int deletedDocs = reader.numDeletedDocs(); - size += deletedDocs; - return new ScorerSupplier() { + int adjustedSize = size + deletedDocs; - final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values); - final PointValues.IntersectVisitor visitor = getIntersectVisitor(result, docCount); + return new ScorerSupplier() { + // Keep a visitor for cost estimation only + DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values); + PointValues.IntersectVisitor visitor = getIntersectVisitor(result, docCount); long cost = -1; + long lastDoc = -1; @Override public Scorer get(long leadCost) throws IOException { - intersectRight(values.getPointTree(), visitor, docCount); - DocIdSetIterator iterator = result.build().iterator(); - return new ConstantScoreScorer(score(), scoreMode, iterator); + if (!isTopLevel) { + // Use leadCost as dynamic size if it's reasonable, otherwise use original size + int dynamicSize = (leadCost > 0 && leadCost < Integer.MAX_VALUE) ? 
(int) leadCost : adjustedSize; + return getWithSize(dynamicSize); + } else { + // For top-level queries, use standard approach + return getWithSize(adjustedSize); + } + } + + public Scorer getWithSize(int dynamicSize) throws IOException { + // Temporarily update size for this call + int originalSize = size; + size = dynamicSize; + + try { + if (size > values.size()) { + hasBeenFullyTraversed = true; + } + // For windowed approach, create fresh iterator without ResumableDISI state + DocIdSetBuilder freshResult = new DocIdSetBuilder(reader.maxDoc(), values); + long[] freshDocCount = new long[1]; + PointValues.IntersectVisitor freshVisitor = getIntersectVisitor(freshResult, freshDocCount); + + // Always start fresh traversal from root using intersectRight for descending + intersectRight(values.getPointTree(), freshVisitor, freshDocCount); + + DocIdSetIterator iterator = freshResult.build().iterator(); + lastDoc = iterator.docIDRunEnd(); + return new ConstantScoreScorer(score(), scoreMode, iterator); + } finally { + // Restore original size + size = originalSize; + } } @Override public long cost() { if (cost == -1) { - // Computing the cost may be expensive, so only do it if necessary - cost = values.estimateDocCount(visitor); - assert cost >= 0; + if (isTopLevel) { + // Computing the cost may be expensive, so only do it if necessary + cost = values.estimateDocCount(visitor); + assert cost >= 0; + } else { + return lastDoc != -1 ? 
lastDoc : values.estimateDocCount(visitor); + } } return cost; } diff --git a/server/src/main/java/org/opensearch/search/approximate/ApproximateScoreQuery.java b/server/src/main/java/org/opensearch/search/approximate/ApproximateScoreQuery.java index be1b6eed5333d..b561655c39695 100644 --- a/server/src/main/java/org/opensearch/search/approximate/ApproximateScoreQuery.java +++ b/server/src/main/java/org/opensearch/search/approximate/ApproximateScoreQuery.java @@ -9,6 +9,7 @@ package org.opensearch.search.approximate; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; @@ -50,6 +51,7 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { } Query rewritten = resolvedQuery.rewrite(indexSearcher); if (rewritten != resolvedQuery) { + // To make sure that query goes through entire rewrite process resolvedQuery = rewritten; } return this; @@ -57,7 +59,26 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException { public void setContext(SearchContext context) { resolvedQuery = approximationQuery.canApproximate(context) ? 
approximationQuery : originalQuery; - }; + + // context may be null (ApproximateBooleanQuery#canApproximate handles that case), so guard before asking it for a searcher + if (context != null && ((resolvedQuery instanceof BooleanQuery) || (resolvedQuery instanceof ApproximateBooleanQuery))) { + resolvedQuery = ApproximateBooleanQuery.boolRewrite(resolvedQuery, context.searcher()); + } + + if (resolvedQuery instanceof ApproximateBooleanQuery approximateBool) { + for (BooleanClause boolClause : approximateBool.boolQuery.clauses()) { + if (boolClause.query() instanceof ApproximateScoreQuery approximateQuery) { + // NOTE(review): the clause's resolvedQuery is inspected before setContext(context) assigns it below - confirm it is already resolved at this point + if (approximateQuery.resolvedQuery instanceof ApproximateBooleanQuery boolQuery) { + boolQuery.setTopLevel(false); + } else if (approximateQuery.resolvedQuery instanceof ApproximatePointRangeQuery pointQuery) { + pointQuery.setTopLevel(false); + } + approximateQuery.setContext(context); + } + } + } + } @Override public String toString(String s) { diff --git a/server/src/test/java/org/opensearch/index/query/BoolQueryBuilderTests.java b/server/src/test/java/org/opensearch/index/query/BoolQueryBuilderTests.java index 85e1d0f00c661..26b61880df929 100644 --- a/server/src/test/java/org/opensearch/index/query/BoolQueryBuilderTests.java +++ b/server/src/test/java/org/opensearch/index/query/BoolQueryBuilderTests.java @@ -50,6 +50,7 @@ import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.core.xcontent.XContentParseException; import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.search.approximate.ApproximateBooleanQuery; import org.opensearch.search.approximate.ApproximateMatchAllQuery; import org.opensearch.search.approximate.ApproximateScoreQuery; import org.opensearch.search.internal.ContextIndexSearcher; @@ -119,8 +120,14 @@ protected void doAssertLuceneQuery(BoolQueryBuilder queryBuilder, Query query, Q assertThat(query, instanceOf(ApproximateScoreQuery.class)); assertThat(((ApproximateScoreQuery) query).getOriginalQuery(), instanceOf(MatchAllDocsQuery.class)); } else if (query instanceof MatchNoDocsQuery == false) { - assertThat(query, 
instanceOf(BooleanQuery.class)); - BooleanQuery booleanQuery = (BooleanQuery) query; + BooleanQuery booleanQuery; + if (query instanceof ApproximateScoreQuery) { // true for single clause cases + assertThat(((ApproximateScoreQuery) query).getOriginalQuery(), instanceOf(BooleanQuery.class)); + booleanQuery = (BooleanQuery) ((ApproximateScoreQuery) query).getOriginalQuery(); + } else { + assertThat(query, instanceOf(BooleanQuery.class)); + booleanQuery = (BooleanQuery) query; + } if (queryBuilder.adjustPureNegative()) { boolean isNegative = true; for (BooleanClause clause : clauses) { @@ -210,14 +217,14 @@ public void testMinShouldMatchFilterWithoutShouldClauses() throws Exception { BoolQueryBuilder boolQueryBuilder = new BoolQueryBuilder(); boolQueryBuilder.filter(new BoolQueryBuilder().must(new MatchAllQueryBuilder())); Query query = boolQueryBuilder.toQuery(createShardContext()); - assertThat(query, instanceOf(BooleanQuery.class)); - BooleanQuery booleanQuery = (BooleanQuery) query; + assertThat(((ApproximateScoreQuery) query).getApproximationQuery(), instanceOf(ApproximateBooleanQuery.class)); + BooleanQuery booleanQuery = (BooleanQuery) ((ApproximateScoreQuery) query).getOriginalQuery(); assertThat(booleanQuery.getMinimumNumberShouldMatch(), equalTo(0)); assertThat(booleanQuery.clauses().size(), equalTo(1)); BooleanClause booleanClause = booleanQuery.clauses().get(0); assertThat(booleanClause.occur(), equalTo(BooleanClause.Occur.FILTER)); - assertThat(booleanClause.query(), instanceOf(BooleanQuery.class)); - BooleanQuery innerBooleanQuery = (BooleanQuery) booleanClause.query(); + assertThat(((ApproximateScoreQuery) booleanClause.query()).getOriginalQuery(), instanceOf(BooleanQuery.class)); + BooleanQuery innerBooleanQuery = (BooleanQuery) ((ApproximateScoreQuery) booleanClause.query()).getOriginalQuery(); // we didn't set minimum should match initially, there are no should clauses so it should be 0 assertThat(innerBooleanQuery.getMinimumNumberShouldMatch(), 
equalTo(0)); assertThat(innerBooleanQuery.clauses().size(), equalTo(1)); diff --git a/server/src/test/java/org/opensearch/index/query/MultiMatchQueryBuilderTests.java b/server/src/test/java/org/opensearch/index/query/MultiMatchQueryBuilderTests.java index d352d54b6f02a..ce7fdaf84f8ac 100644 --- a/server/src/test/java/org/opensearch/index/query/MultiMatchQueryBuilderTests.java +++ b/server/src/test/java/org/opensearch/index/query/MultiMatchQueryBuilderTests.java @@ -55,6 +55,7 @@ import org.opensearch.index.query.MultiMatchQueryBuilder.Type; import org.opensearch.index.search.MatchQuery; import org.opensearch.lucene.queries.ExtendedCommonTermsQuery; +import org.opensearch.search.approximate.ApproximateBooleanQuery; import org.opensearch.search.approximate.ApproximateScoreQuery; import org.opensearch.test.AbstractQueryTestCase; @@ -562,6 +563,7 @@ public void testWithStopWords() throws Exception { query = new BoolQueryBuilder().should(new MultiMatchQueryBuilder("the").field(TEXT_FIELD_NAME).analyzer("stop")) .toQuery(createShardContext()); expected = new BooleanQuery.Builder().add(new MatchNoDocsQuery(), BooleanClause.Occur.SHOULD).build(); + expected = new ApproximateScoreQuery(expected, new ApproximateBooleanQuery((BooleanQuery) expected)); assertEquals(expected, query); query = new BoolQueryBuilder().should( @@ -571,6 +573,7 @@ public void testWithStopWords() throws Exception { new DisjunctionMaxQuery(Arrays.asList(new MatchNoDocsQuery(), new MatchNoDocsQuery()), 0f), BooleanClause.Occur.SHOULD ).build(); + expected = new ApproximateScoreQuery(expected, new ApproximateBooleanQuery((BooleanQuery) expected)); assertEquals(expected, query); } diff --git a/server/src/test/java/org/opensearch/index/query/QueryStringQueryBuilderTests.java b/server/src/test/java/org/opensearch/index/query/QueryStringQueryBuilderTests.java index ea31d2680d4ec..6a4034ee0ae40 100644 --- a/server/src/test/java/org/opensearch/index/query/QueryStringQueryBuilderTests.java +++ 
b/server/src/test/java/org/opensearch/index/query/QueryStringQueryBuilderTests.java @@ -77,6 +77,7 @@ import org.opensearch.index.mapper.MapperService; import org.opensearch.index.search.QueryStringQueryParser; import org.opensearch.lucene.queries.BlendedTermQuery; +import org.opensearch.search.approximate.ApproximateBooleanQuery; import org.opensearch.search.approximate.ApproximatePointRangeQuery; import org.opensearch.search.approximate.ApproximateScoreQuery; import org.opensearch.test.AbstractQueryTestCase; @@ -1454,6 +1455,7 @@ public void testWithStopWords() throws Exception { query = new BoolQueryBuilder().should(new QueryStringQueryBuilder("the").field(TEXT_FIELD_NAME).analyzer("stop")) .toQuery(createShardContext()); expected = new BooleanQuery.Builder().add(new BooleanQuery.Builder().build(), BooleanClause.Occur.SHOULD).build(); + expected = new ApproximateScoreQuery(expected, new ApproximateBooleanQuery((BooleanQuery) expected)); assertEquals(expected, query); query = new BoolQueryBuilder().should( diff --git a/server/src/test/java/org/opensearch/index/query/SimpleQueryStringBuilderTests.java b/server/src/test/java/org/opensearch/index/query/SimpleQueryStringBuilderTests.java index 0edd387ea9c6f..48080f9a13363 100644 --- a/server/src/test/java/org/opensearch/index/query/SimpleQueryStringBuilderTests.java +++ b/server/src/test/java/org/opensearch/index/query/SimpleQueryStringBuilderTests.java @@ -56,6 +56,8 @@ import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.index.search.SimpleQueryStringQueryParser; +import org.opensearch.search.approximate.ApproximateBooleanQuery; +import org.opensearch.search.approximate.ApproximateScoreQuery; import org.opensearch.test.AbstractQueryTestCase; import java.io.IOException; @@ -775,6 +777,7 @@ public void testWithStopWords() throws Exception { query = new BoolQueryBuilder().should(new 
SimpleQueryStringBuilder("the").field(TEXT_FIELD_NAME).analyzer("stop")) .toQuery(createShardContext()); expected = new BooleanQuery.Builder().add(new MatchNoDocsQuery(), BooleanClause.Occur.SHOULD).build(); + expected = new ApproximateScoreQuery(expected, new ApproximateBooleanQuery((BooleanQuery) expected)); assertEquals(expected, query); query = new BoolQueryBuilder().should( diff --git a/server/src/test/java/org/opensearch/search/approximate/ApproximateBooleanQueryTests.java b/server/src/test/java/org/opensearch/search/approximate/ApproximateBooleanQueryTests.java new file mode 100644 index 0000000000000..58e05edbcd177 --- /dev/null +++ b/server/src/test/java/org/opensearch/search/approximate/ApproximateBooleanQueryTests.java @@ -0,0 +1,1084 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.search.approximate; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.TopScoreDocCollectorManager; +import org.apache.lucene.search.Weight; +import org.apache.lucene.store.Directory; +import 
org.apache.lucene.tests.index.RandomIndexWriter; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.index.shard.SearchOperationListener; +import org.opensearch.search.aggregations.BucketCollectorProcessor; +import org.opensearch.search.aggregations.SearchContextAggregations; +import org.opensearch.search.fetch.subphase.highlight.SearchHighlightContext; +import org.opensearch.search.internal.ContextIndexSearcher; +import org.opensearch.search.internal.SearchContext; +import org.opensearch.test.OpenSearchTestCase; + +import java.io.IOException; +import java.util.Arrays; +import java.util.concurrent.ExecutorService; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ApproximateBooleanQueryTests extends OpenSearchTestCase { + + // Unit Tests for canApproximate method + public void testCanApproximateWithNullContext() { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(IntPoint.newRangeQuery("field", 1, 100), BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + assertFalse(query.canApproximate(null)); + } + + public void testCanApproximateWithAccurateTotalHits() { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(IntPoint.newRangeQuery("field", 1, 100), BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(SearchContext.TRACK_TOTAL_HITS_ACCURATE); + + assertFalse(query.canApproximate(mockContext)); + } + + public void testCanApproximateWithAggregations() { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(IntPoint.newRangeQuery("field", 1, 100), BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = 
mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(mock(SearchContextAggregations.class)); + + assertFalse(query.canApproximate(mockContext)); + } + + public void testCanApproximateWithHighlighting() { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(IntPoint.newRangeQuery("field", 1, 100), BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + + SearchHighlightContext mockHighlight = mock(SearchHighlightContext.class); + when(mockHighlight.fields()).thenReturn(Arrays.asList(mock(SearchHighlightContext.Field.class))); + when(mockContext.highlight()).thenReturn(mockHighlight); + + assertFalse(query.canApproximate(mockContext)); + } + + public void testCanApproximateWithValidFilterClauses() { + ApproximateScoreQuery approxQuery1 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field1", 1, 100), + new ApproximatePointRangeQuery( + "field1", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + ApproximateScoreQuery approxQuery2 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field2", 200, 300), + new ApproximatePointRangeQuery( + "field2", + IntPoint.pack(new int[] { 200 }).bytes, + IntPoint.pack(new int[] { 300 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + // Set isTopLevel to false since these are nested in boolean query + ((ApproximatePointRangeQuery) approxQuery1.getApproximationQuery()).setTopLevel(false); + ((ApproximatePointRangeQuery) approxQuery2.getApproximationQuery()).setTopLevel(false); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER) + .add(approxQuery2, 
BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + approxQuery1.setContext(mockContext); + approxQuery2.setContext(mockContext); + + assertTrue(query.canApproximate(mockContext)); + } + + public void testCanApproximateWithMustNotClause() { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(IntPoint.newRangeQuery("field1", 1, 100), BooleanClause.Occur.FILTER) + .add(IntPoint.newRangeQuery("field2", 200, 300), BooleanClause.Occur.MUST_NOT) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + assertFalse(query.canApproximate(mockContext)); + } + + // Unit Tests for ScorerSupplier + public void testScorerSupplierCreation() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) { + // Add test documents + for (int i = 0; i < 20000; i++) { + Document doc = new Document(); + doc.add(new IntPoint("field1", i)); + doc.add(new IntPoint("field2", i * 2)); + doc.add(new NumericDocValuesField("field1", i)); + iw.addDocument(doc); + } + iw.flush(); + + try (IndexReader reader = iw.getReader()) { + IndexSearcher searcher = new IndexSearcher(reader); + LeafReaderContext leafContext = reader.leaves().get(0); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add( + IntPoint.newRangeQuery("field1", 10, 50), + BooleanClause.Occur.FILTER + ).add(IntPoint.newRangeQuery("field2", 20, 100), BooleanClause.Occur.FILTER).build(); 
+ ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f); + ScorerSupplier supplier = weight.scorerSupplier(leafContext); + + assertNotNull(supplier); + assertTrue(supplier instanceof ApproximateBooleanScorerSupplier); + + // Test cost estimation + assertTrue(supplier.cost() > 0); + + // Test scorer creation, scorer should be null since nested ApproximateBooleanQueries shouldn't exist + Scorer scorer = supplier.get(1000); + assertNull(scorer); + } + } + } + } + + // Integration test comparing approximate vs exact results + public void testApproximateVsExactResults() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) { + int numDocs = 12000; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + doc.add(new IntPoint("field1", i % 1000)); + doc.add(new IntPoint("field2", (i * 3) % 1000)); + doc.add(new NumericDocValuesField("field1", i)); + iw.addDocument(doc); + } + iw.flush(); + + try (IndexReader reader = iw.getReader()) { + IndexSearcher searcher = new IndexSearcher(reader); + + int lower1 = 200; + int upper1 = 400; + int lower2 = 300; + int upper2 = 500; + + // Create approximate query + ApproximatePointRangeQuery approxQuery1 = new ApproximatePointRangeQuery( + "field1", + IntPoint.pack(new int[] { lower1 }).bytes, + IntPoint.pack(new int[] { upper1 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ); + approxQuery1.setTopLevel(false); + ApproximatePointRangeQuery approxQuery2 = new ApproximatePointRangeQuery( + "field2", + IntPoint.pack(new int[] { lower2 }).bytes, + IntPoint.pack(new int[] { upper2 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ); + + approxQuery2.setTopLevel(false); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER) + .add(approxQuery2, 
BooleanClause.Occur.FILTER) + .build(); + + ApproximateBooleanQuery approximateQuery = new ApproximateBooleanQuery(boolQuery); + + // Create exact query (same boolean structure) + Query exactQuery = boolQuery; + + // Search with both queries + TopDocs approximateDocs = searcher.search(approximateQuery, 1000); + TopDocs exactDocs = searcher.search(exactQuery, 1000); + + // Results should be identical when approximation is not triggered + // or when we collect all available documents + if (exactDocs.totalHits.value() <= 1000) { + assertEquals( + "Approximate and exact should return same number of docs when under limit", + exactDocs.totalHits.value(), + approximateDocs.totalHits.value() + ); + } + } + } + } + } + + // Test with single clause (nested ApproximateScoreQuery case) + public void testSingleClauseApproximation() { + ApproximatePointRangeQuery pointQuery = new ApproximatePointRangeQuery( + "field", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.LONG_FORMAT + ); + pointQuery.setTopLevel(false); // Set as non-top-level since it's nested + + ApproximateScoreQuery scoreQuery = new ApproximateScoreQuery(IntPoint.newRangeQuery("field", 1, 100), pointQuery); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(scoreQuery, BooleanClause.Occur.MUST).build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + scoreQuery.setContext(mockContext); + + // Should delegate to nested query's canApproximate + boolean result = query.canApproximate(mockContext); + assertTrue(result); + } + + // Test BoolQueryBuilder pattern: All FILTER clauses (multi-clause) + public void testAllFilterClausesCanApproximate() { + // Create approximatable range queries manually 
+ ApproximateScoreQuery approxQuery1 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field1", 1, 100), + new ApproximatePointRangeQuery( + "field1", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + ApproximateScoreQuery approxQuery2 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field2", 200, 300), + new ApproximatePointRangeQuery( + "field2", + IntPoint.pack(new int[] { 200 }).bytes, + IntPoint.pack(new int[] { 300 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + ApproximateScoreQuery approxQuery3 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field3", 400, 500), + new ApproximatePointRangeQuery( + "field3", + IntPoint.pack(new int[] { 400 }).bytes, + IntPoint.pack(new int[] { 500 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + // Set isTopLevel to false since these are nested in boolean query + ((ApproximatePointRangeQuery) approxQuery1.getApproximationQuery()).setTopLevel(false); + ((ApproximatePointRangeQuery) approxQuery2.getApproximationQuery()).setTopLevel(false); + ((ApproximatePointRangeQuery) approxQuery3.getApproximationQuery()).setTopLevel(false); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER) + .add(approxQuery2, BooleanClause.Occur.FILTER) + .add(approxQuery3, BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + approxQuery1.setContext(mockContext); + approxQuery2.setContext(mockContext); + approxQuery3.setContext(mockContext); + + assertTrue("All FILTER clauses should be approximatable", query.canApproximate(mockContext)); + } + + public void 
testSingleClauseMustCanApproximate() { + ApproximateScoreQuery approxQuery = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field", 1, 100), + new ApproximatePointRangeQuery( + "field", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + // Set isTopLevel to false since it's nested in boolean query + ((ApproximatePointRangeQuery) approxQuery.getApproximationQuery()).setTopLevel(false); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery, BooleanClause.Occur.MUST).build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + approxQuery.setContext(mockContext); + + // Single clause with MUST should be approximatable + assertTrue("Single MUST clause should be approximatable", query.canApproximate(mockContext)); + } + + public void testSingleClauseShouldCanApproximate() { + ApproximateScoreQuery approxQuery = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field", 1, 100), + new ApproximatePointRangeQuery("field", new byte[] { 1 }, new byte[] { 100 }, 1, ApproximatePointRangeQuery.INT_FORMAT) + ); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery, BooleanClause.Occur.SHOULD).build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + approxQuery.setContext(mockContext); + + // Single clause with SHOULD should be approximatable with ApproximateScoreQuery + assertTrue("Single SHOULD clause with ApproximateScoreQuery 
should be approximatable", query.canApproximate(mockContext)); + } + + public void testSingleClauseFilterCanApproximate() { + ApproximateScoreQuery approxQuery = new ApproximateScoreQuery( + IntPoint.newRangeQuery("field", 1, 100), + new ApproximatePointRangeQuery( + "field", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery, BooleanClause.Occur.FILTER).build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + approxQuery.setContext(mockContext); + + // Single clause with FILTER should approximate + assertTrue("Single FILTER clause should be approximatable", query.canApproximate(mockContext)); + } + + // Test BoolQueryBuilder pattern: Single clause WITH ApproximateScoreQuery wrapper + public void testSingleClauseWithApproximateScoreQueryCanApproximate() { + // Create ApproximateScoreQuery wrapper (as BoolQueryBuilder would) + ApproximatePointRangeQuery approxQuery = new ApproximatePointRangeQuery( + "field", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.LONG_FORMAT + ); + approxQuery.setTopLevel(false); + ApproximateScoreQuery scoreQuery = new ApproximateScoreQuery(IntPoint.newRangeQuery("field", 1, 100), approxQuery); + + // Test all single clause types (MUST, SHOULD, FILTER) - all should work + BooleanClause.Occur[] occurs = { BooleanClause.Occur.MUST, BooleanClause.Occur.SHOULD, BooleanClause.Occur.FILTER }; + + for (BooleanClause.Occur occur : occurs) { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(scoreQuery, occur).build(); + ApproximateBooleanQuery query = new 
ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + // Single clause with ApproximateScoreQuery should delegate to nested query + boolean result = query.canApproximate(mockContext); + assertTrue("Single " + occur + " clause with ApproximateScoreQuery should be approximatable", result); + } + } + + // Test single MUST_NOT clause should NOT be approximatable + public void testSingleClauseMustNotCannotApproximate() { + ApproximatePointRangeQuery approxQuery = new ApproximatePointRangeQuery( + "field", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.LONG_FORMAT + ); + approxQuery.setTopLevel(false); + ApproximateScoreQuery scoreQuery = new ApproximateScoreQuery(IntPoint.newRangeQuery("field", 1, 100), approxQuery); + + BooleanQuery boolQuery = new BooleanQuery.Builder().add(scoreQuery, BooleanClause.Occur.MUST_NOT).build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + // Single MUST_NOT clause should be blocked + assertFalse("Single MUST_NOT clause should not be approximatable", query.canApproximate(mockContext)); + } + + public void testNestedSingleClauseWithApproximateScoreQuery() { + // Create inner ApproximateScoreQuery manually (verbose version) + ApproximatePointRangeQuery innerApproxQuery = new ApproximatePointRangeQuery( + "inner_field", + IntPoint.pack(new int[] { 50 }).bytes, + IntPoint.pack(new int[] { 150 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ); + innerApproxQuery.setTopLevel(false); + ApproximateScoreQuery 
innerScoreQuery = new ApproximateScoreQuery(IntPoint.newRangeQuery("inner_field", 50, 150), innerApproxQuery); + + // Inner boolean query (single clause) + BooleanQuery innerBoolQuery = new BooleanQuery.Builder().add(innerScoreQuery, BooleanClause.Occur.FILTER).build(); + + ApproximateBooleanQuery innerApproxBoolQuery = new ApproximateBooleanQuery(innerBoolQuery); + ApproximateScoreQuery outerScoreQuery = new ApproximateScoreQuery(innerBoolQuery, innerApproxBoolQuery); + + // Outer boolean query (single clause containing nested) + BooleanQuery outerBoolQuery = new BooleanQuery.Builder().add(outerScoreQuery, BooleanClause.Occur.MUST).build(); + ApproximateBooleanQuery outerQuery = new ApproximateBooleanQuery(outerBoolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + outerScoreQuery.setContext(mockContext); + innerScoreQuery.setContext(mockContext); + + // Should delegate to nested ApproximateBooleanQuery + boolean result = outerQuery.canApproximate(mockContext); + assertTrue("Nested single clause should follow inner query logic and be approximatable", result); + } + + // Test nested boolean query with ApproximateScoreQuery wrapper (multi-clause pattern) + public void testNestedMultiClauseWithApproximateScoreQuery() { + // Create inner ApproximateScoreQuery instances manually + ApproximateScoreQuery innerQuery1 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("inner_field1", 50, 150), + new ApproximatePointRangeQuery( + "inner_field1", + IntPoint.pack(new int[] { 50 }).bytes, + IntPoint.pack(new int[] { 150 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + ApproximateScoreQuery innerQuery2 = new ApproximateScoreQuery( + IntPoint.newRangeQuery("inner_field2", 200, 300), + new ApproximatePointRangeQuery( + "inner_field2", + IntPoint.pack(new int[] { 200 }).bytes, + 
IntPoint.pack(new int[] { 300 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + // Inner boolean query (all FILTER clauses) + BooleanQuery innerBoolQuery = new BooleanQuery.Builder().add(innerQuery1, BooleanClause.Occur.FILTER) + .add(innerQuery2, BooleanClause.Occur.FILTER) + .build(); + + ApproximateBooleanQuery innerApproxQuery = new ApproximateBooleanQuery(innerBoolQuery); + ApproximateScoreQuery scoreQuery = new ApproximateScoreQuery(innerBoolQuery, innerApproxQuery); + + // Create outer ApproximateScoreQuery manually + ApproximateScoreQuery outerFieldQuery = new ApproximateScoreQuery( + IntPoint.newRangeQuery("outer_field", 1, 100), + new ApproximatePointRangeQuery( + "outer_field", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + // Outer boolean query (multi-clause with nested) + BooleanQuery outerBoolQuery = new BooleanQuery.Builder().add(scoreQuery, BooleanClause.Occur.FILTER) + .add(outerFieldQuery, BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery outerQuery = new ApproximateBooleanQuery(outerBoolQuery); + + SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + assertFalse("Nested multi-FILTER clause should not be approximatable", outerQuery.canApproximate(mockContext)); + } + + // Test mixed clause types (should not be approximatable) + public void testMixedClauseTypesCannotApproximate() { + BooleanQuery boolQuery = new BooleanQuery.Builder().add(IntPoint.newRangeQuery("field1", 1, 100), BooleanClause.Occur.FILTER) + .add(IntPoint.newRangeQuery("field2", 200, 300), BooleanClause.Occur.MUST) + .add(IntPoint.newRangeQuery("field3", 400, 500), BooleanClause.Occur.SHOULD) + .build(); + ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery); + + 
SearchContext mockContext = mock(SearchContext.class); + when(mockContext.trackTotalHitsUpTo()).thenReturn(10000); + when(mockContext.aggregations()).thenReturn(null); + when(mockContext.highlight()).thenReturn(null); + + assertFalse("Mixed clause types should not be approximatable", query.canApproximate(mockContext)); + } + + // Test deeply nested boolean queries + public void testDeeplyNestedBooleanQueries() { + // Level 3 (deepest) - Create ApproximateScoreQuery manually + ApproximateScoreQuery deep1Query = new ApproximateScoreQuery( + IntPoint.newRangeQuery("deep_field1", 1, 50), + new ApproximatePointRangeQuery( + "deep_field1", + IntPoint.pack(new int[] { 1 }).bytes, + IntPoint.pack(new int[] { 50 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + ApproximateScoreQuery deep2Query = new ApproximateScoreQuery( + IntPoint.newRangeQuery("deep_field2", 51, 100), + new ApproximatePointRangeQuery( + "deep_field2", + IntPoint.pack(new int[] { 51 }).bytes, + IntPoint.pack(new int[] { 100 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + BooleanQuery level3Query = new BooleanQuery.Builder().add(deep1Query, BooleanClause.Occur.FILTER) + .add(deep2Query, BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery level3Approx = new ApproximateBooleanQuery(level3Query); + ApproximateScoreQuery level3Score = new ApproximateScoreQuery(level3Query, level3Approx); + + // Level 2 (middle) + ApproximateScoreQuery midQuery = new ApproximateScoreQuery( + IntPoint.newRangeQuery("mid_field", 200, 300), + new ApproximatePointRangeQuery( + "mid_field", + IntPoint.pack(new int[] { 200 }).bytes, + IntPoint.pack(new int[] { 300 }).bytes, + 1, + ApproximatePointRangeQuery.INT_FORMAT + ) + ); + + BooleanQuery level2Query = new BooleanQuery.Builder().add(level3Score, BooleanClause.Occur.FILTER) + .add(midQuery, BooleanClause.Occur.FILTER) + .build(); + ApproximateBooleanQuery level2Approx = new ApproximateBooleanQuery(level2Query); + 
ApproximateScoreQuery level2Score = new ApproximateScoreQuery(level2Query, level2Approx);
+
+        // Level 1 (top)
+        ApproximateScoreQuery topFieldQuery = new ApproximateScoreQuery(
+            IntPoint.newRangeQuery("top_field", 400, 500),
+            new ApproximatePointRangeQuery(
+                "top_field",
+                IntPoint.pack(new int[] { 400 }).bytes,
+                IntPoint.pack(new int[] { 500 }).bytes,
+                1,
+                ApproximatePointRangeQuery.INT_FORMAT
+            )
+        );
+
+        BooleanQuery level1Query = new BooleanQuery.Builder().add(level2Score, BooleanClause.Occur.FILTER)
+            .add(topFieldQuery, BooleanClause.Occur.FILTER)
+            .build();
+        ApproximateBooleanQuery topQuery = new ApproximateBooleanQuery(level1Query);
+
+        SearchContext mockContext = mock(SearchContext.class);
+        when(mockContext.trackTotalHitsUpTo()).thenReturn(10000);
+        when(mockContext.aggregations()).thenReturn(null);
+        when(mockContext.highlight()).thenReturn(null);
+
+        assertFalse("Deeply nested all-FILTER queries should not be approximatable", topQuery.canApproximate(mockContext));
+    }
+
+    /**
+     * Edge case: an all-FILTER boolean query wrapped inside another boolean query must be reported as
+     * non-approximatable when the search context carries a highlight definition with at least one field.
+     */
+    public void testNestedQueryWithHighlightingBlocked() {
+        // Inner boolean query (all FILTER clauses)
+        BooleanQuery innerBoolQuery = new BooleanQuery.Builder().add(
+            IntPoint.newRangeQuery("inner_field1", 50, 150),
+            BooleanClause.Occur.FILTER
+        ).add(IntPoint.newRangeQuery("inner_field2", 200, 300), BooleanClause.Occur.FILTER).build();
+
+        ApproximateBooleanQuery innerApproxQuery = new ApproximateBooleanQuery(innerBoolQuery);
+        ApproximateScoreQuery scoreQuery = new ApproximateScoreQuery(innerBoolQuery, innerApproxQuery);
+
+        // Outer boolean query: a single FILTER clause wrapping the inner score query
+        BooleanQuery outerBoolQuery = new BooleanQuery.Builder().add(scoreQuery, BooleanClause.Occur.FILTER).build();
+        ApproximateBooleanQuery outerQuery = new ApproximateBooleanQuery(outerBoolQuery);
+
+        SearchContext mockContext = mock(SearchContext.class);
+        when(mockContext.trackTotalHitsUpTo()).thenReturn(10000);
+        when(mockContext.aggregations()).thenReturn(null);
+
+        // Add highlighting: a highlight context exposing exactly one (mocked) field
+        SearchHighlightContext mockHighlight = mock(SearchHighlightContext.class);
+        when(mockHighlight.fields()).thenReturn(Arrays.asList(mock(SearchHighlightContext.Field.class)));
+        when(mockContext.highlight()).thenReturn(mockHighlight);
+
+        assertFalse("Nested queries with highlighting should be blocked", outerQuery.canApproximate(mockContext));
+    }
+
+    /**
+     * Edge case: the outer query is all-FILTER, but the nested boolean query contains a MUST_NOT clause;
+     * the outer query is expected to be reported as non-approximatable.
+     */
+    public void testNestedQueryWithMustNotClause() {
+        // Inner boolean query (contains MUST_NOT)
+        BooleanQuery innerBoolQuery = new BooleanQuery.Builder().add(
+            IntPoint.newRangeQuery("inner_field1", 50, 150),
+            BooleanClause.Occur.FILTER
+        ).add(IntPoint.newRangeQuery("inner_field2", 200, 300), BooleanClause.Occur.MUST_NOT).build();
+
+        ApproximateBooleanQuery innerApproxQuery = new ApproximateBooleanQuery(innerBoolQuery);
+        ApproximateScoreQuery scoreQuery = new ApproximateScoreQuery(innerBoolQuery, innerApproxQuery);
+
+        // Outer boolean query (all FILTER)
+        BooleanQuery outerBoolQuery = new BooleanQuery.Builder().add(scoreQuery, BooleanClause.Occur.FILTER)
+            .add(IntPoint.newRangeQuery("outer_field", 1, 100), BooleanClause.Occur.FILTER)
+            .build();
+        ApproximateBooleanQuery outerQuery = new ApproximateBooleanQuery(outerBoolQuery);
+
+        SearchContext mockContext = mock(SearchContext.class);
+        when(mockContext.trackTotalHitsUpTo()).thenReturn(10000);
+        when(mockContext.aggregations()).thenReturn(null);
+        when(mockContext.highlight()).thenReturn(null);
+
+        // Should be blocked by inner MUST_NOT clause
+        assertFalse("Nested query with MUST_NOT should not be approximatable", outerQuery.canApproximate(mockContext));
+    }
+
+    /**
+     * Runs a two-clause all-FILTER approximate query over 20,000 documents, a dataset intended to be large
+     * enough to trigger windowed expansion in the bulk scorer. Only the size of the collected result is
+     * asserted: at least one document, and no more than the collector capacity of 10,001.
+     *
+     * @throws IOException if indexing or searching fails
+     */
+    public void testBulkScorerWindowedExpansion() throws IOException {
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) {
+                int numDocs = 20000;
+                for (int i = 0; i < numDocs; i++) {
+                    Document doc = new Document();
+                    doc.add(new IntPoint("field1", i));
+                    doc.add(new IntPoint("field2", i % 1000)); // Create dense overlapping ranges
+                    doc.add(new NumericDocValuesField("field1", i));
+                    doc.add(new NumericDocValuesField("field2", i % 1000));
+                    doc.add(new StoredField("field1", i));
+                    doc.add(new StoredField("field2", i % 1000));
+                    iw.addDocument(doc);
+                }
+                iw.flush();
+
+                try (IndexReader reader = iw.getReader()) {
+                    ContextIndexSearcher searcher = createContextIndexSearcher(reader);
+
+                    // Create approximate queries directly
+                    ApproximatePointRangeQuery approxQuery1 = new ApproximatePointRangeQuery(
+                        "field1",
+                        IntPoint.pack(new int[] { 1000 }).bytes,
+                        IntPoint.pack(new int[] { 20000 }).bytes,
+                        1,
+                        ApproximatePointRangeQuery.INT_FORMAT
+                    );
+                    approxQuery1.setTopLevel(false);
+                    ApproximatePointRangeQuery approxQuery2 = new ApproximatePointRangeQuery(
+                        "field2",
+                        IntPoint.pack(new int[] { 100 }).bytes,
+                        IntPoint.pack(new int[] { 900 }).bytes,
+                        1,
+                        ApproximatePointRangeQuery.INT_FORMAT
+                    );
+                    approxQuery2.setTopLevel(false);
+
+                    BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER)
+                        .add(approxQuery2, BooleanClause.Occur.FILTER)
+                        .build();
+                    ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery);
+
+                    TopScoreDocCollector collector = new TopScoreDocCollectorManager(10001, 10001).newCollector();
+                    searcher.search(query, collector);
+                    TopDocs docs = collector.topDocs();
+
+                    // Should collect documents and potentially expand windows
+                    assertTrue("Should collect some documents", docs.scoreDocs.length > 0);
+                    assertTrue("Should collect up to 10k documents or exhaust", docs.scoreDocs.length <= 10001);
+                }
+            }
+        }
+    }
+
+    /**
+     * Creates a ContextIndexSearcher with properly mocked SearchContext for testing.
+     * <p>
+     * The mocked {@link SearchContext} returns a mocked {@link IndexShard} (which in turn returns a no-op
+     * {@link SearchOperationListener}), a fresh {@link BucketCollectorProcessor}, and delegates
+     * {@code asLocalBucketCountThresholds} to the real method. The searcher is built with Lucene's default
+     * similarity, query cache and caching policy plus a mocked {@link ExecutorService}, and a no-op query
+     * cancellation hook is registered before it is returned.
+     *
+     * @param reader the index reader the searcher operates on
+     * @return a searcher bound to {@code reader} and the mocked search context
+     * @throws IOException if the searcher cannot be constructed
+     */
+    private ContextIndexSearcher createContextIndexSearcher(IndexReader reader) throws IOException {
+        SearchContext searchContext = mock(SearchContext.class);
+        IndexShard indexShard = mock(IndexShard.class);
+        when(searchContext.indexShard()).thenReturn(indexShard);
+        SearchOperationListener searchOperationListener = new SearchOperationListener() {
+        };
+        when(indexShard.getSearchOperationListener()).thenReturn(searchOperationListener);
+        when(searchContext.bucketCollectorProcessor()).thenReturn(new BucketCollectorProcessor());
+        when(searchContext.asLocalBucketCountThresholds(any())).thenCallRealMethod();
+
+        ContextIndexSearcher searcher = new ContextIndexSearcher(
+            reader,
+            IndexSearcher.getDefaultSimilarity(),
+            IndexSearcher.getDefaultQueryCache(),
+            IndexSearcher.getDefaultQueryCachingPolicy(),
+            true,
+            mock(ExecutorService.class),
+            searchContext
+        );
+
+        searcher.addQueryCancellation(() -> {});
+        return searcher;
+    }
+
+    /**
+     * Integration test validating hit count and accuracy.
+     * <p>
+     * Indexes 20,000 documents, runs the same two-range FILTER conjunction once as an
+     * {@link ApproximateBooleanQuery} and once as a plain Lucene {@link BooleanQuery}, then checks that the
+     * approximate total hit count equals the exact one when the exact count is at most 10,000 (and equals
+     * 10,000 otherwise), and that every document returned by the approximate query really lies inside both
+     * ranges according to its stored fields.
+     *
+     * @throws IOException if indexing or searching fails
+     */
+    public void testApproximateResultsValidation() throws IOException {
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) {
+                int numDocs = 20000;
+                for (int i = 0; i < numDocs; i++) {
+                    Document doc = new Document();
+                    int field1Value = i % 1000; // Values: 0-999 (1000 unique values)
+                    int field2Value = i % 500; // Values: 0-499 (500 unique values)
+                    doc.add(new IntPoint("field1", field1Value));
+                    doc.add(new IntPoint("field2", field2Value));
+                    doc.add(new NumericDocValuesField("field1", field1Value));
+                    doc.add(new NumericDocValuesField("field2", field2Value));
+                    doc.add(new StoredField("field1", field1Value));
+                    doc.add(new StoredField("field2", field2Value));
+                    iw.addDocument(doc);
+                }
+                iw.flush();
+
+                try (IndexReader reader = iw.getReader()) {
+                    ContextIndexSearcher searcher = createContextIndexSearcher(reader);
+
+                    int lower1 = 100;
+                    int upper1 = 200;
+                    int lower2 = 50;
+                    int upper2 = 150;
+
+                    // Create approximate query
+                    ApproximatePointRangeQuery approxQuery1 = new ApproximatePointRangeQuery(
+                        "field1",
+                        IntPoint.pack(new int[] { lower1 }).bytes,
+                        IntPoint.pack(new int[] { upper1 }).bytes,
+                        1,
+                        ApproximatePointRangeQuery.INT_FORMAT
+                    );
+                    approxQuery1.setTopLevel(false);
+                    ApproximatePointRangeQuery approxQuery2 = new ApproximatePointRangeQuery(
+                        "field2",
+                        IntPoint.pack(new int[] { lower2 }).bytes,
+                        IntPoint.pack(new int[] { upper2 }).bytes,
+                        1,
+                        ApproximatePointRangeQuery.INT_FORMAT
+                    );
+                    approxQuery2.setTopLevel(false);
+
+                    BooleanQuery approximateBoolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER)
+                        .add(approxQuery2, BooleanClause.Occur.FILTER)
+                        .build();
+                    ApproximateBooleanQuery approximateQuery = new ApproximateBooleanQuery(approximateBoolQuery);
+
+                    // Create exact query (regular Lucene BooleanQuery)
+                    BooleanQuery exactBoolQuery = new BooleanQuery.Builder().add(
+                        IntPoint.newRangeQuery("field1", lower1, upper1),
+                        BooleanClause.Occur.FILTER
+                    ).add(IntPoint.newRangeQuery("field2", lower2, upper2), BooleanClause.Occur.FILTER).build();
+
+                    TopScoreDocCollector collector = new TopScoreDocCollectorManager(10001, 10001).newCollector();
+
+                    searcher.search(approximateQuery, collector);
+
+                    // Results of the approximate query
+                    TopDocs approximateDocs = collector.topDocs();
+
+                    TopScoreDocCollector collectorExact = new TopScoreDocCollectorManager(10001, 10001).newCollector();
+
+                    searcher.search(exactBoolQuery, collectorExact);
+
+                    // Results of the exact query, used as the reference
+                    TopDocs exactDocs = collectorExact.topDocs();
+
+                    // Validate hit count logic
+                    if (exactDocs.totalHits.value() <= 10000) {
+                        assertEquals(
+                            "When exact results ≤ 10k, approximate should match exactly",
+                            exactDocs.totalHits.value(),
+                            approximateDocs.totalHits.value()
+                        );
+                    } else {
+                        assertEquals(
+                            "Approximate should return exactly 10k hits when exact > 10k",
+                            10000,
+                            approximateDocs.totalHits.value()
+                        );
+                    }
+
+                    // Validate hit accuracy - each returned doc should match the query criteria
+                    StoredFields storedFields = reader.storedFields();
+                    for (int i = 0; i < approximateDocs.scoreDocs.length; i++) {
+                        int docId = approximateDocs.scoreDocs[i].doc;
+                        Document doc = storedFields.document(docId);
+
+                        int field1Value = doc.getField("field1").numericValue().intValue();
+                        int field2Value = doc.getField("field2").numericValue().intValue();
+
+                        assertTrue(
+                            "field1 should be in range [" + lower1 + ", " + upper1 + "], got: " + field1Value,
+                            field1Value >= lower1 && field1Value <= upper1
+                        );
+                        assertTrue(
+                            "field2 should be in range [" + lower2 + ", " + upper2 + "], got: " + field2Value,
+                            field2Value >= lower2 && field2Value <= upper2
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Builds the scorer supplier for a two-clause all-FILTER approximate query on the first leaf of a
+     * 1,000-document index and checks that it is a non-null {@link ApproximateBooleanScorerSupplier}.
+     * <p>
+     * NOTE(review): despite its name this test does not assert any window size; it only verifies that the
+     * supplier can be created (a wrong supplier type surfaces as a {@link ClassCastException} from the cast).
+     *
+     * @throws IOException if indexing or weight creation fails
+     */
+    public void testWindowSizeHeuristic() throws IOException {
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) {
+                for (int i = 0; i < 1000; i++) {
+                    Document doc = new Document();
+                    doc.add(new IntPoint("field1", i));
+                    doc.add(new IntPoint("field2", i * 2));
+                    iw.addDocument(doc);
+                }
+                iw.flush();
+
+                try (IndexReader reader = iw.getReader()) {
+                    ContextIndexSearcher searcher = createContextIndexSearcher(reader);
+                    LeafReaderContext leafContext = reader.leaves().get(0);
+
+                    // Create approximate queries directly
+                    ApproximatePointRangeQuery approxQuery1 = new ApproximatePointRangeQuery(
+                        "field1",
+                        IntPoint.pack(new int[] { 100 }).bytes,
+                        IntPoint.pack(new int[] { 900 }).bytes,
+                        1,
+                        ApproximatePointRangeQuery.INT_FORMAT
+                    );
+                    approxQuery1.setTopLevel(false);
+                    ApproximatePointRangeQuery approxQuery2 = new ApproximatePointRangeQuery(
+                        "field2",
+                        IntPoint.pack(new int[] { 200 }).bytes,
+                        IntPoint.pack(new int[] { 1800 }).bytes,
+                        1,
+                        ApproximatePointRangeQuery.INT_FORMAT
+                    );
+                    approxQuery2.setTopLevel(false);
+
+                    BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER)
+                        .add(approxQuery2, BooleanClause.Occur.FILTER)
+                        .build();
+                    ApproximateBooleanQuery query = new ApproximateBooleanQuery(boolQuery);
+
+                    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+                    ApproximateBooleanScorerSupplier supplier = (ApproximateBooleanScorerSupplier) weight.scorerSupplier(leafContext);
+
+                    assertNotNull(supplier);
+                }
+            }
+        }
+    }
+
+    /**
+     * Sparse data distribution (simulating the http_logs dataset): 10,000 documents whose timestamps leave
+     * gaps and whose status codes cluster around 200, 400 and 500. Two range conjunctions are validated via
+     * {@link #testApproximateQueryValidation}.
+     *
+     * @throws IOException if indexing or searching fails
+     */
+    public void testSparseDataDistribution() throws IOException {
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) {
+                String fieldName1 = "timestamp";
+                String fieldName2 = "status_code";
+
+                // Create sparse timestamp distribution with dense status code clusters
+                for (int i = 0; i < 10000; i++) {
+                    Document doc = new Document();
+                    // Sparse timestamps (gaps in time)
+                    int timestamp = i * 10 + (i % 6);
+                    // Dense status code clusters (200s, 400s, 500s): 70% / 10% / 20% of each block of 100 docs
+                    int statusCode = (i % 100) < 70 ? 200 + (i % 11) : ((i % 100) < 80 ? 400 + (i % 11) : 500 + (i % 11));
+
+                    doc.add(new IntPoint(fieldName1, timestamp));
+                    doc.add(new IntPoint(fieldName2, statusCode));
+                    doc.add(new NumericDocValuesField(fieldName1, timestamp));
+                    doc.add(new NumericDocValuesField(fieldName2, statusCode));
+                    doc.add(new StoredField(fieldName1, timestamp));
+                    doc.add(new StoredField(fieldName2, statusCode));
+                    iw.addDocument(doc);
+                }
+                iw.flush();
+
+                try (IndexReader reader = iw.getReader()) {
+                    ContextIndexSearcher searcher = createContextIndexSearcher(reader);
+
+                    // Test query for specific time range and status codes
+                    testApproximateQueryValidation(searcher, fieldName1, fieldName2, 10000, 50000, 200, 500, 100);
+                    testApproximateQueryValidation(searcher, fieldName1, fieldName2, 0, 20000, 404, 404, 50);
+                }
+            }
+        }
+    }
+
+    /**
+     * Dense data distribution (simulating the nyc_taxis dataset): three documents for every combination of
+     * fare (500..5000 in steps of 50) and distance (1..49 in steps of 2). Two range conjunctions are
+     * validated via {@link #testApproximateQueryValidation}.
+     *
+     * @throws IOException if indexing or searching fails
+     */
+    public void testDenseDataDistribution() throws IOException {
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new WhitespaceAnalyzer())) {
+                String fieldName1 = "fare_amount";
+                String fieldName2 = "trip_distance";
+
+                // Create dense overlapping distributions
+                for (int fare = 500; fare <= 5000; fare += 50) { // Dense fare distribution
+                    for (int distance = 1; distance <= 50; distance += 2) { // Dense distance distribution
+                        // Add multiple documents per combination to create density
+                        int numDocs = 3;
+                        for (int d = 0; d < numDocs; d++) {
+                            Document doc = new Document();
+                            doc.add(new IntPoint(fieldName1, fare));
+                            doc.add(new IntPoint(fieldName2, distance));
+                            doc.add(new NumericDocValuesField(fieldName1, fare));
+                            doc.add(new NumericDocValuesField(fieldName2, distance));
+                            doc.add(new StoredField(fieldName1, fare));
+                            doc.add(new StoredField(fieldName2, distance));
+                            iw.addDocument(doc);
+                        }
+                    }
+                }
+                iw.flush();
+
+                try (IndexReader reader = iw.getReader()) {
+                    ContextIndexSearcher searcher = createContextIndexSearcher(reader);
+
+                    // Test queries for different fare and distance ranges
+                    testApproximateQueryValidation(searcher, fieldName1, fieldName2, 1000, 3000, 5, 25, 200);
+                    testApproximateQueryValidation(searcher, fieldName1, fieldName2, 2000, 4000, 10, 40, 500);
+                }
+            }
+        }
+    }
+
+    /**
+     * Shared validation helper used by the sparse and dense distribution tests; it takes parameters and is
+     * not a standalone test case.
+     * <p>
+     * Runs an all-FILTER {@link ApproximateBooleanQuery} over two integer point ranges, collecting at most
+     * {@code size + 1} documents, then asserts that the number of returned documents does not exceed
+     * {@code size + 1}, that the reported total hit count does not exceed 10,000, and that the stored values
+     * of every returned document fall inside both ranges.
+     *
+     * @param searcher searcher over an index whose documents store {@code field1} and {@code field2}
+     * @param field1   name of the first integer point field
+     * @param field2   name of the second integer point field
+     * @param lower1   inclusive lower bound for {@code field1}
+     * @param upper1   inclusive upper bound for {@code field1}
+     * @param lower2   inclusive lower bound for {@code field2}
+     * @param upper2   inclusive upper bound for {@code field2}
+     * @param size     requested result size; the collector is created with {@code size + 1}
+     * @throws IOException if searching or reading stored fields fails
+     */
+    public void testApproximateQueryValidation(
+        ContextIndexSearcher searcher,
+        String field1,
+        String field2,
+        int lower1,
+        int upper1,
+        int lower2,
+        int upper2,
+        int size
+    ) throws IOException {
+        // Create approximate query using ApproximatePointRangeQuery directly
+        ApproximatePointRangeQuery approxQuery1 = new ApproximatePointRangeQuery(
+            field1,
+            IntPoint.pack(new int[] { lower1 }).bytes,
+            IntPoint.pack(new int[] { upper1 }).bytes,
+            1,
+            ApproximatePointRangeQuery.INT_FORMAT
+        );
+        approxQuery1.setTopLevel(false);
+        ApproximatePointRangeQuery approxQuery2 = new ApproximatePointRangeQuery(
+            field2,
+            IntPoint.pack(new int[] { lower2 }).bytes,
+            IntPoint.pack(new int[] { upper2 }).bytes,
+            1,
+            ApproximatePointRangeQuery.INT_FORMAT
+        );
+        approxQuery2.setTopLevel(false);
+
+        BooleanQuery boolQuery = new BooleanQuery.Builder().add(approxQuery1, BooleanClause.Occur.FILTER)
+            .add(approxQuery2, BooleanClause.Occur.FILTER)
+            .build();
+        ApproximateBooleanQuery approximateQuery = new ApproximateBooleanQuery(boolQuery);
+
+        TopScoreDocCollector collector = new TopScoreDocCollectorManager(size + 1, size + 1).newCollector();
+        searcher.search(approximateQuery, collector);
+        TopDocs approxDocs = collector.topDocs();
+
+        // Validate hit count. The sum is parenthesised so the message shows the numeric limit (size + 1)
+        // instead of the digits of size followed by a literal "1".
+        assertTrue("Approximate query should return at most " + (size + 1) + " docs", approxDocs.scoreDocs.length <= size + 1);
+        assertTrue("Should not exceed 10k hits", approxDocs.totalHits.value() <= 10000);
+
+        // Validate hit accuracy - each returned doc should match the query criteria
+        StoredFields storedFields = searcher.getIndexReader().storedFields();
+        for (int i = 0; i < approxDocs.scoreDocs.length; i++) {
+            int docId = approxDocs.scoreDocs[i].doc;
+            Document doc = storedFields.document(docId);
+
+            int field1Value = doc.getField(field1).numericValue().intValue();
+            int field2Value = doc.getField(field2).numericValue().intValue();
+
+            assertTrue(
+                field1 + " should be in range [" + lower1 + ", " + upper1 + "], got: " + field1Value,
+                field1Value >= lower1 && field1Value <= upper1
+            );
+            assertTrue(
+                field2 + " should be in range [" + lower2 + ", " + upper2 + "], got: " + field2Value,
+                field2Value >= lower2 && field2Value <= upper2
+            );
+        }
+    }
+}