diff --git a/CHANGELOG.md b/CHANGELOG.md
index 276ba96567449..91421a9068259 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -58,6 +58,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Delegate getMin/getMax methods for ExitableTerms ([#20775](https://github.com/opensearch-project/OpenSearch/pull/20775))
 - Fix terms lookup subquery fetch limit reading from non-existent index setting instead of cluster `max_clause_count` ([#20823](https://github.com/opensearch-project/OpenSearch/pull/20823))
 - Fix array_index_out_of_bounds_exception with wildcard and aggregations ([#20842](https://github.com/opensearch-project/OpenSearch/pull/20842))
+- Handle dependencies between analyzers ([#19248](https://github.com/opensearch-project/OpenSearch/pull/19248))
 
 ### Dependencies
 - Bump shadow-gradle-plugin from 8.3.9 to 9.3.1 ([#20569](https://github.com/opensearch-project/OpenSearch/pull/20569))
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java
index c2e20e99473de..be5693ddaa02a 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java
@@ -72,7 +72,18 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
         List<TokenFilterFactory> previousTokenFilters,
         Function<String, TokenFilterFactory> allFilters
     ) {
-        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
+        return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters, name -> null);
+    }
+
+    @Override
+    public TokenFilterFactory getChainAwareTokenFilterFactory(
+        TokenizerFactory tokenizer,
+        List<CharFilterFactory> charFilters,
+        List<TokenFilterFactory> previousTokenFilters,
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
+    ) {
+        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
         final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
         final String name = name();
         return new TokenFilterFactory() {
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
index 1cd78170e66c8..ff644e0232b2e 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
@@ -114,7 +114,18 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
         List<TokenFilterFactory> previousTokenFilters,
         Function<String, TokenFilterFactory> allFilters
     ) {
-        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
+        return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters, name -> null);
+    }
+
+    @Override
+    public TokenFilterFactory getChainAwareTokenFilterFactory(
+        TokenizerFactory tokenizer,
+        List<CharFilterFactory> charFilters,
+        List<TokenFilterFactory> previousTokenFilters,
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
+    ) {
+        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
         final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
         final String name = name();
         return new TokenFilterFactory() {
@@ -147,10 +158,15 @@ Analyzer buildSynonymAnalyzer(
         TokenizerFactory tokenizer,
         List<CharFilterFactory> charFilters,
         List<TokenFilterFactory> tokenFilters,
-        Function<String, TokenFilterFactory> allFilters
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
     ) {
         if (synonymAnalyzerName != null) {
-            Analyzer customSynonymAnalyzer;
+            // First, check analyzers already built from this index's settings; the resolver may be null or miss.
+            Analyzer customSynonymAnalyzer = analyzersBuiltSoFar == null ? null : analyzersBuiltSoFar.apply(synonymAnalyzerName);
+            if (customSynonymAnalyzer != null) {
+                return customSynonymAnalyzer;
+            }
             try {
                 customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
             } catch (IOException e) {
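To make the lookup order in `buildSynonymAnalyzer` concrete: the resolver for analyzers built earlier in the same index build is consulted before the global registry. A minimal sketch of that strategy, illustrative only and not part of the patch — `registryLookup` is an invented stand-in for `analysisRegistry.getAnalyzer`:

```java
import java.util.Map;
import java.util.function.Function;

import org.apache.lucene.analysis.Analyzer;

final class SynonymAnalyzerResolutionSketch {
    /**
     * Resolve a synonym_analyzer by name: prefer an analyzer already built
     * during the current index build, then fall back to a registry lookup.
     * Returns null when neither source knows the name.
     */
    static Analyzer resolve(String name, Function<String, Analyzer> analyzersBuiltSoFar, Map<String, Analyzer> registryLookup) {
        // Same-index analyzers win; the resolver may be null or return null.
        Analyzer fromThisBuild = analyzersBuiltSoFar == null ? null : analyzersBuiltSoFar.apply(name);
        if (fromThisBuild != null) {
            return fromThisBuild;
        }
        return registryLookup.get(name); // stand-in for analysisRegistry.getAnalyzer(name)
    }
}
```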
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/EdgeNGramTokenizerTests.java
index 7681a3f6626e3..744954c3e0daf 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/EdgeNGramTokenizerTests.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/EdgeNGramTokenizerTests.java
@@ -49,11 +49,9 @@
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.Collections;
 
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.hasToString;
-
 public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {
 
     private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
@@ -99,7 +97,14 @@ public void testPreConfiguredTokenizer() throws IOException {
             IllegalArgumentException.class,
             () -> buildAnalyzers(VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT), "edgeNGram")
         );
-        assertThat(e, hasToString(containsString("The [edgeNGram] tokenizer name was deprecated pre 1.0.")));
+
+        assertTrue(
+            "expected deprecation message in suppressed causes",
+            Arrays.stream(e.getSuppressed())
+                .map(org.opensearch.ExceptionsHelper::unwrapCause)
+                .map(Throwable::getMessage)
+                .anyMatch(msg -> msg.contains("The [edgeNGram] tokenizer name was deprecated pre 1.0."))
+        );
     }
 
 }
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java
index 33d92e01a85b1..4fb3c938f3d39 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java
@@ -120,7 +120,7 @@ public void testSynonymWordDeleteByAnalyzer() throws IOException {
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
-            assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
+            assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerWithStopSynonymBeforeSynonym]"));
         }
     }
 
@@ -141,7 +141,7 @@ public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
-            assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
+            assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerExpandWithStopBeforeSynonym]"));
         }
     }
 
@@ -269,7 +269,7 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
             TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
             TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
             SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
-            Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
+            Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null);
 
             try (TokenStream ts = analyzer.tokenStream("field", "text")) {
                 assertThat(ts, instanceOf(KeywordTokenizer.class));
@@ -350,7 +350,7 @@ public void testDisallowedTokenFilters() throws IOException {
             IllegalArgumentException e = expectThrows(
                 IllegalArgumentException.class,
                 "Expected IllegalArgumentException for factory " + factory,
-                () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
+                () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null)
             );
 
             assertEquals(factory, "Token filter [" + factory + "] cannot be used to parse synonyms", e.getMessage());
@@ -443,4 +443,91 @@ public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
             assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
         }
     }
+
+    /**
+     * Test the core dependency-resolution issue from GitHub #18037:
+     * synonym_graph with a custom synonym_analyzer should work even when
+     * the main analyzer contains word_delimiter_graph, which would normally
+     * cause a "cannot be used to parse synonyms" error.
+     *
+     * This test intentionally declares the dependent analyzer before the
+     * synonym analyzer to ensure the system resolves dependencies
+     * automatically, without requiring a manual "order" setting.
+     */
+    public void testSynonymAnalyzerDependencyResolution() throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
+
+            // Declare the dependent analyzer FIRST, intentionally
+            .put("index.analysis.analyzer.main_analyzer.type", "custom")
+            .put("index.analysis.analyzer.main_analyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.main_analyzer.filter", "lowercase", "test_word_delimiter", "test_synonyms")
+
+            // Filter that is problematic for synonym parsing
+            .put("index.analysis.filter.test_word_delimiter.type", "word_delimiter_graph")
+            .put("index.analysis.filter.test_word_delimiter.generate_word_parts", true)
+
+            // Synonym filter referencing another analyzer
+            .put("index.analysis.filter.test_synonyms.type", "synonym_graph")
+            .putList("index.analysis.filter.test_synonyms.synonyms", "laptop,notebook")
+            .put("index.analysis.filter.test_synonyms.synonym_analyzer", "simple_synonym_analyzer")
+
+            // Define the synonym analyzer AFTER the main analyzer
+            .put("index.analysis.analyzer.simple_synonym_analyzer.type", "custom")
+            .put("index.analysis.analyzer.simple_synonym_analyzer.tokenizer", "standard")
+            .build();
+
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test_index", settings);
+
+        // Should succeed with the fix (would previously fail due to registration order)
+        IndexAnalyzers analyzers = new AnalysisModule(
+            TestEnvironment.newEnvironment(settings),
+            Collections.singletonList(new CommonAnalysisModulePlugin())
+        ).getAnalysisRegistry().build(idxSettings);
+
+        assertNotNull("main_analyzer should be created", analyzers.get("main_analyzer"));
+        assertNotNull("simple_synonym_analyzer should be created", analyzers.get("simple_synonym_analyzer"));
+    }
+
+    /**
+     * Verifies that circular synonym_analyzer dependencies are detected
+     * and rejected during analyzer construction.
+     */
+    public void testCircularSynonymAnalyzerDependency() throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
+
+            // Analyzer A depends on B
+            .put("index.analysis.analyzer.analyzer_a.type", "custom")
+            .put("index.analysis.analyzer.analyzer_a.tokenizer", "standard")
+            .putList("index.analysis.analyzer.analyzer_a.filter", "syn_filter_a")
+
+            .put("index.analysis.filter.syn_filter_a.type", "synonym_graph")
+            .putList("index.analysis.filter.syn_filter_a.synonyms", "foo,bar")
+            .put("index.analysis.filter.syn_filter_a.synonym_analyzer", "analyzer_b")
+
+            // Analyzer B depends on A
+            .put("index.analysis.analyzer.analyzer_b.type", "custom")
+            .put("index.analysis.analyzer.analyzer_b.tokenizer", "standard")
+            .putList("index.analysis.analyzer.analyzer_b.filter", "syn_filter_b")
+
+            .put("index.analysis.filter.syn_filter_b.type", "synonym_graph")
+            .putList("index.analysis.filter.syn_filter_b.synonyms", "baz,qux")
+            .put("index.analysis.filter.syn_filter_b.synonym_analyzer", "analyzer_a")
+
+            .build();
+
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test_index", settings);
+
+        IllegalArgumentException e = expectThrows(
+            IllegalArgumentException.class,
+            () -> new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisModulePlugin()))
+                .getAnalysisRegistry()
+                .build(idxSettings)
+        );
+
+        assertThat(e.getMessage(), startsWith("Circular analyzer dependency"));
+    }
 }
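For readers tracing the first new test: under the assumption that it mirrors `buildAnalyzerDependencies` below (this snippet is illustrative, not part of the patch), the dependency map extracted from those settings and the build order the sort should emit look like this:

```java
import java.util.List;
import java.util.Map;
import java.util.Set;

class DependencyOrderExample {
    public static void main(String[] args) {
        // Edges derived from synonym_analyzer references in the test settings:
        // main_analyzer depends on simple_synonym_analyzer, which has no dependencies.
        Map<String, Set<String>> dependencies = Map.of(
            "main_analyzer", Set.of("simple_synonym_analyzer"),
            "simple_synonym_analyzer", Set.of()
        );
        // A valid topological order builds the dependency first:
        List<String> expectedBuildOrder = List.of("simple_synonym_analyzer", "main_analyzer");
        System.out.println(dependencies + " -> " + expectedBuildOrder);
    }
}
```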
diff --git a/server/src/main/java/org/opensearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/opensearch/index/analysis/AnalysisRegistry.java
index 0162268131eb2..c0bc972ff4f4a 100644
--- a/server/src/main/java/org/opensearch/index/analysis/AnalysisRegistry.java
+++ b/server/src/main/java/org/opensearch/index/analysis/AnalysisRegistry.java
@@ -51,12 +51,17 @@
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.UncheckedIOException;
+import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Deque;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.function.BiFunction;
 import java.util.function.Function;
@@ -305,7 +310,7 @@ public NamedAnalyzer buildCustomAnalyzer(
                 } catch (IOException e) {
                     throw new UncheckedIOException(e);
                 }
-            });
+            }, null);
             tokenFilterFactories.add(tff);
         }
 
@@ -362,7 +367,107 @@ public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings ind
     private Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
         final Map<String, Settings> analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer");
-        return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
+        final Map<String, Settings> filterSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_FILTER);
+
+        Map<String, Set<String>> dependencies = buildAnalyzerDependencies(analyzersSettings, filterSettings);
+        List<String> buildOrder = topologicalSortAnalyzers(dependencies);
+
+        Map<String, Settings> sortedAnalyzersSettings = new LinkedHashMap<>();
+        for (String analyzerName : buildOrder) {
+            sortedAnalyzersSettings.put(analyzerName, analyzersSettings.get(analyzerName));
+        }
+
+        return buildMapping(
+            Component.ANALYZER,
+            indexSettings,
+            sortedAnalyzersSettings,
+            analyzers,
+            prebuiltAnalysis.analyzerProviderFactories
+        );
+    }
+
+    private static Map<String, Set<String>> buildAnalyzerDependencies(
+        Map<String, Settings> analyzersSettings,
+        Map<String, Settings> filterSettings
+    ) {
+        Map<String, Set<String>> dependencies = new LinkedHashMap<>();
+
+        for (Map.Entry<String, Settings> analyzerEntry : analyzersSettings.entrySet()) {
+            String analyzerName = analyzerEntry.getKey();
+            Settings analyzerSettings = analyzerEntry.getValue();
+
+            Set<String> analyzerDependencies = new LinkedHashSet<>();
+            for (String filterName : analyzerSettings.getAsList("filter")) {
+                Settings fs = filterSettings.get(filterName);
+                if (fs == null || fs.isEmpty()) {
+                    continue;
+                }
+
+                String type = fs.get("type");
+                if ("synonym".equals(type) || "synonym_graph".equals(type)) {
+                    String synonymAnalyzer = fs.get("synonym_analyzer");
+                    if (synonymAnalyzer != null
+                        && synonymAnalyzer.equals(analyzerName) == false
+                        && analyzersSettings.containsKey(synonymAnalyzer)) {
+                        analyzerDependencies.add(synonymAnalyzer);
+                    }
+                }
+            }
+
+            dependencies.put(analyzerName, analyzerDependencies);
+        }
+
+        return dependencies;
+    }
+
+    // Kahn's algorithm: topological sort of analyzers over their synonym_analyzer dependencies
+    private static List<String> topologicalSortAnalyzers(Map<String, Set<String>> analyzerDependencies) {
+        Map<String, Integer> remainingDependencies = new LinkedHashMap<>(); // in-degree: number of unbuilt dependencies
+        Map<String, List<String>> dependentsByAnalyzer = new LinkedHashMap<>(); // reverse edges: dependency -> dependents
+
+        for (String analyzer : analyzerDependencies.keySet()) {
+            remainingDependencies.put(analyzer, 0);
+            dependentsByAnalyzer.put(analyzer, new ArrayList<>());
+        }
+
+        for (Map.Entry<String, Set<String>> entry : analyzerDependencies.entrySet()) {
+            String analyzer = entry.getKey();
+            for (String dependency : entry.getValue()) {
+                if (analyzerDependencies.containsKey(dependency)) {
+                    remainingDependencies.put(analyzer, remainingDependencies.get(analyzer) + 1);
+                    dependentsByAnalyzer.get(dependency).add(analyzer);
+                }
+            }
+        }
+
+        Deque<String> ready = new ArrayDeque<>(); // queue of analyzers whose dependencies are all built
+        for (Map.Entry<String, Integer> entry : remainingDependencies.entrySet()) {
+            if (entry.getValue() == 0) {
+                ready.addLast(entry.getKey());
+            }
+        }
+
+        List<String> buildOrder = new ArrayList<>();
+        while (ready.isEmpty() == false) {
+            String builtAnalyzer = ready.removeFirst();
+            buildOrder.add(builtAnalyzer);
+
+            for (String dependent : dependentsByAnalyzer.get(builtAnalyzer)) {
+                int remaining = remainingDependencies.get(dependent) - 1;
+                remainingDependencies.put(dependent, remaining);
+                if (remaining == 0) {
+                    ready.addLast(dependent);
+                }
+            }
+        }
+
+        if (buildOrder.size() != analyzerDependencies.size()) {
+            Set<String> cycle = new LinkedHashSet<>(analyzerDependencies.keySet());
+            cycle.removeAll(buildOrder);
+            throw new IllegalArgumentException("Circular analyzer dependency detected via synonym_analyzer: " + cycle);
+        }
+
+        return buildOrder;
     }
 
     private Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
@@ -486,7 +591,7 @@ private <T> Map<String, T> buildMapping(
         Map<String, ? extends AnalysisModule.AnalysisProvider<T>> defaultInstance
     ) throws IOException {
         Settings defaultSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
-        Map<String, T> factories = new HashMap<>();
+        Map<String, T> factories = new LinkedHashMap<>(); // preserve dependency-sorted insertion order
         for (Map.Entry<String, Settings> entry : settingsMap.entrySet()) {
             String name = entry.getKey();
             Settings currentSettings = entry.getValue();
@@ -637,20 +742,25 @@ public IndexAnalyzers build(
         Map<String, NamedAnalyzer> analyzers = new HashMap<>();
         Map<String, NamedAnalyzer> normalizers = new HashMap<>();
         Map<String, NamedAnalyzer> whitespaceNormalizers = new HashMap<>();
+        Map<String, Exception> buildErrors = new LinkedHashMap<>();
+        Map<String, Analyzer> analyzersBuiltSoFar = new HashMap<>();
         for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
-            analyzers.merge(
-                entry.getKey(),
-                produceAnalyzer(
+            try {
+                NamedAnalyzer namedAnalyzer = produceAnalyzer(
                     entry.getKey(),
                     entry.getValue(),
                     tokenFilterFactoryFactories,
                     charFilterFactoryFactories,
-                    tokenizerFactoryFactories
-                ),
-                (k, v) -> {
+                    tokenizerFactoryFactories,
+                    analyzersBuiltSoFar
+                );
+                analyzers.merge(entry.getKey(), namedAnalyzer, (k, v) -> {
                     throw new IllegalStateException("already registered analyzer with name: " + entry.getKey());
-                }
-            );
+                });
+                analyzersBuiltSoFar.put(entry.getKey(), namedAnalyzer);
+            } catch (Exception e) {
+                buildErrors.put(entry.getKey(), e);
+            }
         }
         for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
             processNormalizerFactory(
@@ -707,6 +817,14 @@ public IndexAnalyzers build(
                 throw new IllegalArgumentException("analyzer name must not start with '_'. got \"" + analyzer.getKey() + "\"");
             }
         }
+
+        if (buildErrors.isEmpty() == false) {
+            IllegalArgumentException aggregated = new IllegalArgumentException("Failed to build analyzers: " + buildErrors.keySet());
+            buildErrors.forEach(
+                (name, ex) -> aggregated.addSuppressed(new IllegalArgumentException("[" + name + "] " + ex.getMessage(), ex))
+            );
+            throw aggregated;
+        }
         return new IndexAnalyzers(analyzers, normalizers, whitespaceNormalizers);
     }
 
@@ -716,6 +834,17 @@ private static NamedAnalyzer produceAnalyzer(
         Map<String, TokenFilterFactory> tokenFilters,
         Map<String, CharFilterFactory> charFilters,
         Map<String, TokenizerFactory> tokenizers
+    ) {
+        return produceAnalyzer(name, analyzerFactory, tokenFilters, charFilters, tokenizers, Collections.emptyMap());
+    }
+
+    private static NamedAnalyzer produceAnalyzer(
+        String name,
+        AnalyzerProvider<?> analyzerFactory,
+        Map<String, TokenFilterFactory> tokenFilters,
+        Map<String, CharFilterFactory> charFilters,
+        Map<String, TokenizerFactory> tokenizers,
+        Map<String, Analyzer> analyzersBuiltSoFar
     ) {
         /*
          * Lucene defaults positionIncrementGap to 0 in all analyzers but
@@ -725,7 +854,7 @@ private static NamedAnalyzer produceAnalyzer(
          */
        int overridePositionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
         if (analyzerFactory instanceof CustomAnalyzerProvider customAnalyzerProvider) {
-            customAnalyzerProvider.build(tokenizers, charFilters, tokenFilters);
+            customAnalyzerProvider.build(tokenizers, charFilters, tokenFilters, analyzersBuiltSoFar);
             /*
              * Custom analyzers already default to the correct, version
              * dependent positionIncrementGap and the user is be able to
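Since `topologicalSortAnalyzers` above carries the core of the change, here is a self-contained sketch of the same Kahn's-algorithm shape on plain strings — illustrative only, not the patch — showing why any nodes left over after the queue drains must sit on (or depend on) a cycle:

```java
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

final class KahnSortSketch {
    /** deps: node -> set of nodes that must be built before it. */
    static List<String> sort(Map<String, Set<String>> deps) {
        Map<String, Integer> inDegree = new LinkedHashMap<>();
        Map<String, List<String>> dependents = new LinkedHashMap<>();
        for (String node : deps.keySet()) {
            inDegree.put(node, 0);
            dependents.put(node, new ArrayList<>());
        }
        deps.forEach((node, ds) -> ds.forEach(d -> {
            if (dependents.containsKey(d)) { // ignore edges to undeclared nodes
                inDegree.merge(node, 1, Integer::sum);
                dependents.get(d).add(node); // reverse edge: d -> node
            }
        }));
        Deque<String> ready = new ArrayDeque<>();
        inDegree.forEach((node, degree) -> {
            if (degree == 0) ready.add(node);
        });
        List<String> order = new ArrayList<>();
        while (ready.isEmpty() == false) {
            String node = ready.remove();
            order.add(node);
            for (String dependent : dependents.get(node)) {
                if (inDegree.merge(dependent, -1, Integer::sum) == 0) {
                    ready.add(dependent);
                }
            }
        }
        if (order.size() != deps.size()) {
            // Every acyclic node was emitted, so leftovers sit on a cycle or depend on one.
            Set<String> cycle = new LinkedHashSet<>(deps.keySet());
            order.forEach(cycle::remove);
            throw new IllegalArgumentException("Circular dependency detected: " + cycle);
        }
        return order;
    }
}
```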
diff --git a/server/src/main/java/org/opensearch/index/analysis/AnalyzerComponents.java b/server/src/main/java/org/opensearch/index/analysis/AnalyzerComponents.java
index bc9438d7eaecb..0c2568f39f841 100644
--- a/server/src/main/java/org/opensearch/index/analysis/AnalyzerComponents.java
+++ b/server/src/main/java/org/opensearch/index/analysis/AnalyzerComponents.java
@@ -32,9 +32,11 @@
 
 package org.opensearch.index.analysis;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.opensearch.common.settings.Settings;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
@@ -69,6 +71,17 @@ static AnalyzerComponents createComponents(
         final Map<String, TokenizerFactory> tokenizers,
         final Map<String, CharFilterFactory> charFilters,
         final Map<String, TokenFilterFactory> tokenFilters
+    ) {
+        return createComponents(name, analyzerSettings, tokenizers, charFilters, tokenFilters, Collections.emptyMap());
+    }
+
+    static AnalyzerComponents createComponents(
+        String name,
+        Settings analyzerSettings,
+        final Map<String, TokenizerFactory> tokenizers,
+        final Map<String, CharFilterFactory> charFilters,
+        final Map<String, TokenFilterFactory> tokenFilters,
+        final Map<String, Analyzer> analyzersBuiltSoFar
     ) {
         String tokenizerName = analyzerSettings.get("tokenizer");
         if (tokenizerName == null) {
@@ -103,7 +116,13 @@ static AnalyzerComponents createComponents(
                     "Custom Analyzer [" + name + "] failed to find filter under name " + "[" + tokenFilterName + "]"
                 );
             }
-            tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(tokenizer, charFiltersList, tokenFilterList, tokenFilters::get);
+            tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(
+                tokenizer,
+                charFiltersList,
+                tokenFilterList,
+                tokenFilters::get,
+                analyzersBuiltSoFar::get
+            );
             tokenFilterList.add(tokenFilter);
         }
 
diff --git a/server/src/main/java/org/opensearch/index/analysis/CustomAnalyzerProvider.java b/server/src/main/java/org/opensearch/index/analysis/CustomAnalyzerProvider.java
index f339b6fc302bf..327fd54aab5eb 100644
--- a/server/src/main/java/org/opensearch/index/analysis/CustomAnalyzerProvider.java
+++ b/server/src/main/java/org/opensearch/index/analysis/CustomAnalyzerProvider.java
@@ -61,9 +61,10 @@ public CustomAnalyzerProvider(IndexSettings indexSettings, String name, Settings
     void build(
         final Map<String, TokenizerFactory> tokenizers,
         final Map<String, CharFilterFactory> charFilters,
-        final Map<String, TokenFilterFactory> tokenFilters
+        final Map<String, TokenFilterFactory> tokenFilters,
+        final Map<String, Analyzer> analyzersBuiltSoFar
     ) {
-        customAnalyzer = create(name(), analyzerSettings, tokenizers, charFilters, tokenFilters);
+        customAnalyzer = create(name(), analyzerSettings, tokenizers, charFilters, tokenFilters, analyzersBuiltSoFar);
     }
 
     /**
@@ -75,12 +76,20 @@ private static Analyzer create(
         Settings analyzerSettings,
         Map<String, TokenizerFactory> tokenizers,
         Map<String, CharFilterFactory> charFilters,
-        Map<String, TokenFilterFactory> tokenFilters
+        Map<String, TokenFilterFactory> tokenFilters,
+        Map<String, Analyzer> analyzersBuiltSoFar
     ) {
         int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
         positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);
         int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
-        AnalyzerComponents components = createComponents(name, analyzerSettings, tokenizers, charFilters, tokenFilters);
+        AnalyzerComponents components = createComponents(
+            name,
+            analyzerSettings,
+            tokenizers,
+            charFilters,
+            tokenFilters,
+            analyzersBuiltSoFar
+        );
         if (components.analysisMode().equals(AnalysisMode.SEARCH_TIME)) {
             return new ReloadableCustomAnalyzer(components, positionIncrementGap, offsetGap);
         } else {
diff --git a/server/src/main/java/org/opensearch/index/analysis/TokenFilterFactory.java b/server/src/main/java/org/opensearch/index/analysis/TokenFilterFactory.java
index af18927de8a98..1299d755d1f25 100644
--- a/server/src/main/java/org/opensearch/index/analysis/TokenFilterFactory.java
+++ b/server/src/main/java/org/opensearch/index/analysis/TokenFilterFactory.java
@@ -32,6 +32,7 @@
 
 package org.opensearch.index.analysis;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.opensearch.common.annotation.PublicApi;
@@ -86,6 +87,34 @@ default TokenFilterFactory getChainAwareTokenFilterFactory(
         return this;
     }
 
+    /**
+     * Like {@link #getChainAwareTokenFilterFactory(TokenizerFactory, List, List, Function)},
+     * but also provides a resolver for analyzers that have already been constructed
+     * earlier in this index build (e.g., a {@code synonym_analyzer} referenced by name).
+     *
+     * <p>Example: {@code SynonymTokenFilterFactory} can resolve the analyzer named by
+     * {@code synonym_analyzer} using {@code analyzersBuiltSoFar}.</p>
+     *
+     * <p>The call {@code analyzersBuiltSoFar.apply(name)} returns an {@link org.apache.lucene.analysis.Analyzer}
+     * if (and only if) that analyzer was already built in the current index build; it may return {@code null}.</p>
+     *
+     * @param tokenizer the TokenizerFactory for the preceding chain
+     * @param charFilters any CharFilterFactories for the preceding chain
+     * @param previousTokenFilters a list of TokenFilterFactories in the preceding chain
+     * @param allFilters access to previously defined TokenFilterFactories
+     * @param analyzersBuiltSoFar {@code name -> Analyzer} for analyzers already built earlier in this index build (may return null)
+     * @since 3.3.0
+     */
+    default TokenFilterFactory getChainAwareTokenFilterFactory(
+        TokenizerFactory tokenizer,
+        List<CharFilterFactory> charFilters,
+        List<TokenFilterFactory> previousTokenFilters,
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
+    ) {
+        return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
+    }
+
     /**
      * Return a version of this TokenFilterFactory appropriate for synonym parsing
     *
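As a usage sketch of the new default method — hypothetical plugin code, not part of the patch; the class and the referenced analyzer name are invented:

```java
import java.util.List;
import java.util.function.Function;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

// Hypothetical factory that opts into the new chain-aware hook.
class ExampleChainAwareFilterFactory implements TokenFilterFactory {
    private static final String REFERENCED_ANALYZER = "my_helper_analyzer"; // invented name

    @Override
    public String name() {
        return "example_chain_aware";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return tokenStream; // pass-through; a real filter would wrap the stream
    }

    @Override
    public TokenFilterFactory getChainAwareTokenFilterFactory(
        TokenizerFactory tokenizer,
        List<CharFilterFactory> charFilters,
        List<TokenFilterFactory> previousTokenFilters,
        Function<String, TokenFilterFactory> allFilters,
        Function<String, Analyzer> analyzersBuiltSoFar
    ) {
        // Resolve an already-built analyzer by name; null if unknown or not built yet.
        Analyzer resolved = analyzersBuiltSoFar.apply(REFERENCED_ANALYZER);
        // A real implementation would use 'resolved' (e.g., to parse rules) when non-null.
        return this;
    }
}
```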
diff --git a/server/src/test/java/org/opensearch/index/analysis/AnalysisRegistryTests.java b/server/src/test/java/org/opensearch/index/analysis/AnalysisRegistryTests.java
index 2c7f11063cf0f..8027deb028011 100644
--- a/server/src/test/java/org/opensearch/index/analysis/AnalysisRegistryTests.java
+++ b/server/src/test/java/org/opensearch/index/analysis/AnalysisRegistryTests.java
@@ -60,6 +60,7 @@
 import org.opensearch.test.VersionUtils;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
@@ -68,6 +69,7 @@
 import static java.util.Collections.emptyMap;
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.mockito.Mockito.mock;
@@ -480,7 +482,57 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
             new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(plugin)).getAnalysisRegistry()
                 .build(exceptionSettings);
         });
-        assertEquals("Cannot use token filter [exception]", e.getMessage());
+        assertTrue(
+            "expected token filter exception in suppressed causes",
+            Arrays.stream(e.getSuppressed())
+                .map(org.opensearch.ExceptionsHelper::unwrapCause)
+                .map(Throwable::getMessage)
+                .anyMatch(msg -> msg.contains("Cannot use token filter [exception]"))
+        );
+    }
+
+    public void testAggregatesAnalyzerBuildFailuresAndContinuesRegistrationLoop() {
+        // Build settings with two broken analyzers (different failure modes)
+        // and one valid analyzer to prove we keep iterating past the first failure.
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
+
+            // bad1: unknown token filter name -> fails when producing the filter factory
+            .put("index.analysis.analyzer.bad1.type", "custom")
+            .put("index.analysis.analyzer.bad1.tokenizer", "standard")
+            .putList("index.analysis.analyzer.bad1.filter", "lowercase", "does_not_exist_filter")
+
+            // bad2: unknown tokenizer -> fails when producing the tokenizer factory
+            .put("index.analysis.analyzer.bad2.type", "custom")
+            .put("index.analysis.analyzer.bad2.tokenizer", "does_not_exist_tokenizer")
+            .putList("index.analysis.analyzer.bad2.filter", "lowercase")
+
+            // good: a valid analyzer we expect to be *attempted* after bad1
+            .put("index.analysis.analyzer.no_split_synonym_analyzer.type", "custom")
+            .put("index.analysis.analyzer.no_split_synonym_analyzer.tokenizer", "standard")
+            .build();
+
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+
+        IllegalArgumentException e = expectThrows(
+            IllegalArgumentException.class,
+            () -> new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.emptyList()).getAnalysisRegistry()
+                .build(idxSettings)
+        );
+
+        // After the fix, we expect a single aggregated exception that names both failing analyzers
+        assertThat(e.getMessage(), containsString("bad1"));
+        assertThat(e.getMessage(), containsString("bad2"));
+
+        // And we expect two suppressed causes (one per bad analyzer)
+        Throwable[] suppressed = e.getSuppressed();
+        assertNotNull(suppressed);
+        assertEquals(2, suppressed.length);
+
+        // Sanity check: each suppressed cause should carry a helpful message
+        assertThat(suppressed[0].getMessage(), containsString("bad"));
+        assertThat(suppressed[1].getMessage(), containsString("bad"));
     }
 }
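Finally, a short sketch of how a caller might surface every failure carried by the aggregated exception thrown from `AnalysisRegistry#build` — illustrative only; `report` is an invented helper:

```java
import java.util.Arrays;

import org.opensearch.ExceptionsHelper;

final class AggregatedBuildErrorReport {
    /** Print the aggregate message plus one line per failing analyzer. */
    static void report(IllegalArgumentException aggregated) {
        System.err.println(aggregated.getMessage()); // e.g. "Failed to build analyzers: [bad1, bad2]"
        Arrays.stream(aggregated.getSuppressed())
            .map(ExceptionsHelper::unwrapCause)
            .map(Throwable::getMessage)
            .forEach(message -> System.err.println(" - " + message));
    }
}
```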