From 3fe7d968134d2540f7ce9b84def8b47de640696f Mon Sep 17 00:00:00 2001 From: Harsha Vamsi Kalluri Date: Fri, 20 Mar 2026 17:49:47 +0000 Subject: [PATCH] Fix geotile_grid aggregation CPU stall on large geo_shapes Signed-off-by: Harsha Vamsi Kalluri --- CHANGELOG.md | 1 + .../geogrid/GeoHashGridAggregatorFactory.java | 8 +- .../geogrid/GeoTileGridAggregatorFactory.java | 8 +- .../bucket/geogrid/util/GeoShapeHashUtil.java | 21 +++++ .../geogrid/GeoTileGridAggregatorTests.java | 82 +++++++++++++++++++ .../aggregations/bucket/GeoTileUtils.java | 52 ++++++++++++ .../bucket/geogrid/GeoTileUtilsTests.java | 59 +++++++++++++ 7 files changed, 229 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61e717fbf9171..5ff567d1271d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix terms lookup subquery fetch limit reading from non-existent index setting instead of cluster `max_clause_count` ([#20823](https://github.com/opensearch-project/OpenSearch/pull/20823)) - Fix array_index_out_of_bounds_exception with wildcard and aggregations ([#20842](https://github.com/opensearch-project/OpenSearch/pull/20842)) - Handle dependencies between analyzers ([#19248](https://github.com/opensearch-project/OpenSearch/pull/19248)) +- Fix geotile_grid aggregation CPU stall on large geo_shapes by adding tile limit and cooperative cancellation ([#20929](https://github.com/opensearch-project/OpenSearch/pull/20929)) ### Dependencies - Bump shadow-gradle-plugin from 8.3.9 to 9.3.1 ([#20569](https://github.com/opensearch-project/OpenSearch/pull/20569)) diff --git a/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoHashGridAggregatorFactory.java b/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoHashGridAggregatorFactory.java index 60ee1973c1080..7e63178ffe326 100644 --- a/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoHashGridAggregatorFactory.java +++ b/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoHashGridAggregatorFactory.java @@ -33,6 +33,7 @@ package org.opensearch.geo.search.aggregations.bucket.geogrid; import org.opensearch.common.geo.GeoBoundingBox; +import org.opensearch.core.tasks.TaskCancelledException; import org.opensearch.geo.search.aggregations.bucket.geogrid.cells.CellIdSource; import org.opensearch.geo.search.aggregations.bucket.geogrid.cells.GeoShapeCellIdSource; import org.opensearch.geo.search.aggregations.bucket.geogrid.util.GeoShapeHashUtil; @@ -175,11 +176,16 @@ static void registerAggregators(ValuesSourceRegistry.Builder builder) { parent, cardinality, metadata) -> { + final SearchContext ctx = aggregationContext; final GeoShapeCellIdSource cellIdSource = new GeoShapeCellIdSource( (ValuesSource.GeoShape) valuesSource, precision, geoBoundingBox, - GeoShapeHashUtil::encodeShape + (docValue, p) -> GeoShapeHashUtil.encodeShape(docValue, p, () -> { + if (ctx.isCancelled()) { + throw new TaskCancelledException("Cancelling geohash_grid aggregation on geo_shape field"); + } + }) ); return new GeoHashGridAggregator( name, diff --git a/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorFactory.java b/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorFactory.java index 54b82f9770b63..c628602ee5405 100644 --- a/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorFactory.java +++ b/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorFactory.java @@ -33,6 +33,7 @@ package org.opensearch.geo.search.aggregations.bucket.geogrid; import org.opensearch.common.geo.GeoBoundingBox; +import org.opensearch.core.tasks.TaskCancelledException; import org.opensearch.geo.search.aggregations.bucket.geogrid.cells.CellIdSource; import org.opensearch.geo.search.aggregations.bucket.geogrid.cells.GeoShapeCellIdSource; import org.opensearch.index.query.QueryShardContext; @@ -173,11 +174,16 @@ static void registerAggregators(ValuesSourceRegistry.Builder builder) { parent, cardinality, metadata) -> { + final SearchContext ctx = aggregationContext; GeoShapeCellIdSource cellIdSource = new GeoShapeCellIdSource( (ValuesSource.GeoShape) valuesSource, precision, geoBoundingBox, - GeoTileUtils::encodeShape + (docValue, p) -> GeoTileUtils.encodeShape(docValue, p, () -> { + if (ctx.isCancelled()) { + throw new TaskCancelledException("Cancelling geotile_grid aggregation on geo_shape field"); + } + }) ); return new GeoTileGridAggregator( name, diff --git a/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/util/GeoShapeHashUtil.java b/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/util/GeoShapeHashUtil.java index aefb31e623bb5..1a578e6e5c48f 100644 --- a/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/util/GeoShapeHashUtil.java +++ b/modules/geo/src/main/java/org/opensearch/geo/search/aggregations/bucket/geogrid/util/GeoShapeHashUtil.java @@ -21,6 +21,11 @@ */ public class GeoShapeHashUtil { + /** + * Interval at which we check for cancellation inside the geohash iteration loop. + */ + private static final int CANCELLATION_CHECK_INTERVAL = 1024; + /** * The function encodes the shape provided as {@link GeoShapeDocValue} to a {@link List} of {@link Long} values * (representing the GeoHashes) which are intersecting with the shapes at a given precision. @@ -30,6 +35,18 @@ public class GeoShapeHashUtil { * @return {@link List} containing encoded {@link Long} values */ public static List encodeShape(final GeoShapeDocValue geoShapeDocValue, final int precision) { + return encodeShape(geoShapeDocValue, precision, () -> {}); + } + + /** + * Encodes the shape to a list of intersecting geohash long values, with periodic cancellation checks. + * + * @param geoShapeDocValue {@link GeoShapeDocValue} + * @param precision int + * @param checkCancelled a {@link Runnable} invoked periodically to allow the caller to abort + * @return {@link List} containing encoded {@link Long} values + */ + public static List encodeShape(final GeoShapeDocValue geoShapeDocValue, final int precision, final Runnable checkCancelled) { final List encodedValues = new ArrayList<>(); final GeoShapeDocValue.BoundingRectangle boundingRectangle = geoShapeDocValue.getBoundingRectangle(); long topLeftGeoHash = Geohash.longEncode(boundingRectangle.getMinX(), boundingRectangle.getMaxY(), precision); @@ -39,7 +56,11 @@ public static List encodeShape(final GeoShapeDocValue geoShapeDocValue, fi long currentValue = topLeftGeoHash; long rightMax = topRightGeoHash; long tempCurrent = currentValue; + int iterationCount = 0; while (true) { + if (++iterationCount % CANCELLATION_CHECK_INTERVAL == 0) { + checkCancelled.run(); + } // check if this currentValue intersect with shape. final Rectangle geohashRectangle = Geohash.toBoundingBox(Geohash.stringEncode(tempCurrent)); if (geoShapeDocValue.isIntersectingRectangle(geohashRectangle)) { diff --git a/modules/geo/src/test/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorTests.java b/modules/geo/src/test/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorTests.java index f2f641ea794c0..7e8ba8350ba36 100644 --- a/modules/geo/src/test/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorTests.java +++ b/modules/geo/src/test/java/org/opensearch/geo/search/aggregations/bucket/geogrid/GeoTileGridAggregatorTests.java @@ -32,8 +32,21 @@ package org.opensearch.geo.search.aggregations.bucket.geogrid; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.LatLonShape; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.opensearch.common.geo.GeoShapeUtils; +import org.opensearch.geometry.Line; +import org.opensearch.index.mapper.GeoShapeFieldMapper; +import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.search.aggregations.bucket.GeoTileUtils; +import java.io.IOException; + public class GeoTileGridAggregatorTests extends GeoGridAggregatorTestCase { @Override @@ -61,4 +74,73 @@ public void testPrecision() { builder.precision(precision); assertEquals(precision, builder.precision()); } + + /** + * Test that a LineString spanning large distances doesn't cause excessive tile generation. + * This reproduces bug #20413 where a LineString with a large bounding box can generate + * millions of tiles at high precision, causing CPU to max out and the cluster to stall. + */ + public void testLineStringWithLargeBoundingBoxAtHighPrecision() { + final String fieldName = "location"; + // Create a LineString spanning nearly the entire world diagonally. + // The diagonal span ensures a large bounding box in both x and y tile dimensions. + Line line = new Line(new double[] { -170.0, 170.0 }, new double[] { -60.0, 60.0 }); + + try (Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + Document doc = new Document(); + doc.add(LatLonShape.createDocValueField(fieldName, GeoShapeUtils.toLuceneLine(line))); + writer.addDocument(doc); + + MappedFieldType fieldType = new GeoShapeFieldMapper.GeoShapeFieldType(fieldName); + + // At precision 15 (2^15 = 32768 tiles/axis), this diagonal line's bounding box + // spans ~31000 x-tiles and many y-tiles, well exceeding MAX_TILES_PER_SHAPE (65536) + GeoTileGridAggregationBuilder aggBuilder = new GeoTileGridAggregationBuilder("geotile_grid"); + aggBuilder.field(fieldName).precision(15); + + try (IndexReader reader = writer.getReader()) { + IndexSearcher searcher = new IndexSearcher(reader); + + // With the fix, this should throw an IllegalArgumentException + // instead of hanging indefinitely + Exception exception = expectThrows( + IllegalArgumentException.class, + () -> searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, fieldType) + ); + assertTrue("Exception should mention tile limit", exception.getMessage().contains("would generate too many tiles")); + } + } catch (IOException e) { + fail("IOException during test: " + e.getMessage()); + } + } + + /** + * Test that a LineString with moderate size works correctly at reasonable precision. + */ + public void testLineStringWithModerateBoundingBox() throws IOException { + final String fieldName = "location"; + // Create a smaller LineString that should work fine + Line line = new Line(new double[] { -1.0, 1.0 }, new double[] { 0.0, 0.0 }); + + try (Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + Document doc = new Document(); + doc.add(LatLonShape.createDocValueField(fieldName, GeoShapeUtils.toLuceneLine(line))); + writer.addDocument(doc); + + MappedFieldType fieldType = new GeoShapeFieldMapper.GeoShapeFieldType(fieldName); + + // This should work fine even at high precision + GeoTileGridAggregationBuilder aggBuilder = new GeoTileGridAggregationBuilder("geotile_grid"); + aggBuilder.field(fieldName).precision(10); + + try (IndexReader reader = writer.getReader()) { + IndexSearcher searcher = new IndexSearcher(reader); + GeoTileGrid result = searchAndReduce(searcher, new MatchAllDocsQuery(), aggBuilder, fieldType); + + // Should successfully generate buckets + assertNotNull(result); + assertTrue("Expected buckets to be generated", result.getBuckets().size() > 0); + } + } + } } diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/GeoTileUtils.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/GeoTileUtils.java index dfb8f6be7155d..52f7f786cb82b 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/GeoTileUtils.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/GeoTileUtils.java @@ -81,6 +81,13 @@ private GeoTileUtils() {} */ public static final double LATITUDE_MASK = 85.0511287798066; + /** + * Maximum number of tiles that can be generated for a single shape. + * This prevents excessive CPU usage for shapes with large bounding boxes (e.g., long LineStrings). + * The limit is set to allow reasonable precision while preventing cluster stalls. + */ + public static final int MAX_TILES_PER_SHAPE = 65536; + /** * Since shapes are encoded, their boundaries are to be compared to against the encoded/decoded values of LATITUDE_MASK */ @@ -288,6 +295,11 @@ public static Rectangle toBoundingBox(String hash) { return toBoundingBox(hashAsInts[1], hashAsInts[2], hashAsInts[0]); } + /** + * Interval at which we check for cancellation inside the tile iteration loop. + */ + private static final int CANCELLATION_CHECK_INTERVAL = 1024; + /** * The function encodes the shape provided as {@link GeoShapeDocValue} to a {@link List} of {@link Long} values * (representing the GeoTiles) which are intersecting with the shapes at a given precision. @@ -297,6 +309,19 @@ public static Rectangle toBoundingBox(String hash) { * @return {@link List} of {@link Long} */ public static List encodeShape(final GeoShapeDocValue geoShapeDocValue, final int precision) { + return encodeShape(geoShapeDocValue, precision, () -> {}); + } + + /** + * Encodes the shape to a list of intersecting tile long values, with periodic cancellation checks. + * + * @param geoShapeDocValue {@link GeoShapeDocValue} + * @param precision int + * @param checkCancelled a {@link Runnable} invoked periodically to allow the caller to abort + * (e.g. by throwing a TaskCancelledException) + * @return {@link List} of {@link Long} + */ + public static List encodeShape(final GeoShapeDocValue geoShapeDocValue, final int precision, final Runnable checkCancelled) { final GeoShapeDocValue.BoundingRectangle boundingRectangle = geoShapeDocValue.getBoundingRectangle(); // generate all the grid long values that this shape intersects. final long totalTilesAtPrecision = 1L << checkPrecisionRange(precision); @@ -306,9 +331,36 @@ public static List encodeShape(final GeoShapeDocValue geoShapeDocValue, fi // take maxY and for maxYTile we need to take minY. int minYTile = getYTile(boundingRectangle.getMaxY(), totalTilesAtPrecision); int maxYTile = getYTile(boundingRectangle.getMinY(), totalTilesAtPrecision); + + // Calculate the potential number of tiles to prevent excessive iteration + long xTileRange = (long) maxXTile - minXTile + 1; + long yTileRange = (long) maxYTile - minYTile + 1; + long potentialTiles = xTileRange * yTileRange; + + // Hard limit: reject shapes that would generate an unreasonable number of tiles + if (potentialTiles > MAX_TILES_PER_SHAPE) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Tile aggregation on GeoShape field would generate too many tiles (%d) at precision %d. " + + "The shape spans %d x-tiles and %d y-tiles. Maximum allowed is %d tiles per shape. " + + "Consider using a lower precision or a smaller shape.", + potentialTiles, + precision, + xTileRange, + yTileRange, + MAX_TILES_PER_SHAPE + ) + ); + } + final List encodedValues = new ArrayList<>(); + int iterationCount = 0; for (int x = minXTile; x <= maxXTile; x++) { for (int y = minYTile; y <= maxYTile; y++) { + if (++iterationCount % CANCELLATION_CHECK_INTERVAL == 0) { + checkCancelled.run(); + } // Convert the precision, x , y to encoded value. long encodedValue = longEncodeTiles(precision, x, y); // Convert encoded value to rectangle diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/geogrid/GeoTileUtilsTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/geogrid/GeoTileUtilsTests.java index 1443208a1d2fc..b43fde72d4019 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/geogrid/GeoTileUtilsTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/geogrid/GeoTileUtilsTests.java @@ -33,10 +33,14 @@ package org.opensearch.search.aggregations.bucket.geogrid; import org.opensearch.common.geo.GeoPoint; +import org.opensearch.common.geo.GeoShapeDocValue; +import org.opensearch.geometry.Line; import org.opensearch.geometry.Rectangle; import org.opensearch.search.aggregations.bucket.GeoTileUtils; import org.opensearch.test.OpenSearchTestCase; +import java.util.List; + import static org.opensearch.search.aggregations.bucket.GeoTileUtils.MAX_ZOOM; import static org.opensearch.search.aggregations.bucket.GeoTileUtils.checkPrecisionRange; import static org.opensearch.search.aggregations.bucket.GeoTileUtils.hashToGeoPoint; @@ -47,6 +51,8 @@ import static org.hamcrest.Matchers.closeTo; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.lessThan; public class GeoTileUtilsTests extends OpenSearchTestCase { @@ -265,4 +271,57 @@ public void testPointToTile() { assertThat(GeoTileUtils.getYTile(y, tiles), equalTo(yTile)); } + + /** + * Test that encodeShape throws an exception when the shape would generate too many tiles. + * This tests the fix for bug #20413 where a LineString with a large bounding box could + * generate millions of tiles at high precision, causing CPU to max out. + */ + public void testEncodeShapeExceedsTileLimit() { + // Create a LineString spanning most of the world in both longitude and latitude. + // The diagonal span ensures a large bounding box in both x and y tile dimensions. + Line line = new Line(new double[] { -170.0, 170.0 }, new double[] { -60.0, 60.0 }); + GeoShapeDocValue docValue = GeoShapeDocValue.createGeometryDocValue(line); + + // At precision 15 (2^15 = 32768 tiles/axis), this line's bounding box spans + // ~31000 x-tiles and many y-tiles, well exceeding MAX_TILES_PER_SHAPE (65536) + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> GeoTileUtils.encodeShape(docValue, 15)); + assertThat(exception.getMessage(), containsString("would generate too many tiles")); + assertThat(exception.getMessage(), containsString("at precision 15")); + } + + /** + * Test that encodeShape works correctly for shapes within the tile limit. + */ + public void testEncodeShapeWithinTileLimit() { + // Create a small LineString + Line line = new Line(new double[] { -1.0, 1.0 }, new double[] { 0.0, 0.0 }); + GeoShapeDocValue docValue = GeoShapeDocValue.createGeometryDocValue(line); + + // This should work fine even at high precision + List tiles = GeoTileUtils.encodeShape(docValue, 10); + + // Should generate some tiles but not exceed the limit + assertThat(tiles.size(), greaterThan(0)); + assertThat(tiles.size(), lessThan(GeoTileUtils.MAX_TILES_PER_SHAPE)); + } + + /** + * Test that encodeShape respects the cancellation callback. + * This verifies that long-running tile iterations can be cancelled by the search timeout. + */ + public void testEncodeShapeRespectsCancel() { + // Create a moderate LineString that generates enough tiles to trigger the check + Line line = new Line(new double[] { -10.0, 10.0 }, new double[] { -5.0, 5.0 }); + GeoShapeDocValue docValue = GeoShapeDocValue.createGeometryDocValue(line); + + // Use a cancellation callback that throws after being called + RuntimeException cancelException = new RuntimeException("cancelled"); + Runnable alwaysCancel = () -> { throw cancelException; }; + + // At a moderate precision, iteration count should exceed the check interval (1024) + // and the cancellation should fire + RuntimeException thrown = expectThrows(RuntimeException.class, () -> GeoTileUtils.encodeShape(docValue, 10, alwaysCancel)); + assertSame(cancelException, thrown); + } }