Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,27 @@

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenizerFactory;

public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory {

    // Determinized automaton compiled once from the "pattern" setting; shared by
    // every tokenizer this factory creates, so the regex is parsed exactly once.
    private final Automaton dfa;

    /**
     * Creates the factory, compiling the {@code pattern} setting (default: empty string)
     * into a determinized automaton eagerly so that misconfigured or pathologically
     * complex patterns fail at factory construction rather than per-tokenizer.
     *
     * @throws org.apache.lucene.util.automaton.TooComplexToDeterminizeException
     *         if determinization exceeds {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT}
     */
    public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, settings, name);

        final String pattern = settings.get("pattern", "");
        this.dfa = Operations.determinize(new RegExp(pattern).toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
    }

    @Override
    public Tokenizer create() {
        // Pass the precompiled automaton directly; avoids recompiling the pattern
        // string inside each SimplePatternSplitTokenizer instance.
        return new SimplePatternSplitTokenizer(dfa);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,27 @@

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenizerFactory;

public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {

    // Determinized automaton compiled once from the "pattern" setting; shared by
    // every tokenizer this factory creates, so the regex is parsed exactly once.
    private final Automaton dfa;

    /**
     * Creates the factory, compiling the {@code pattern} setting (default: empty string)
     * into a determinized automaton eagerly so that misconfigured or pathologically
     * complex patterns fail at factory construction rather than per-tokenizer.
     *
     * @throws org.apache.lucene.util.automaton.TooComplexToDeterminizeException
     *         if determinization exceeds {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT}
     */
    public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, settings, name);

        final String pattern = settings.get("pattern", "");
        this.dfa = Operations.determinize(new RegExp(pattern).toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
    }

    @Override
    public Tokenizer create() {
        // Pass the precompiled automaton directly; avoids recompiling the pattern
        // string inside each SimplePatternTokenizer instance.
        return new SimplePatternTokenizer(dfa);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.opensearch.common.settings.Settings;
import org.opensearch.core.index.Index;
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTokenStreamTestCase;

import java.io.IOException;
import java.io.StringReader;

public class SimplePatternTokenizerTests extends OpenSearchTokenStreamTestCase {

    // Patterns like (a+|b+)*c produce an NFA that must be determinized before the
    // tokenizer can run it; this exercises the factory's eager determinization path.
    public void testComplexRegexRequiringDeterminization() throws IOException {
        final Settings tokenizerSettings = Settings.builder().put("pattern", "(a+|b+)*c").build();
        final SimplePatternTokenizerFactory tokenizerFactory = new SimplePatternTokenizerFactory(
            IndexSettingsModule.newIndexSettings(new Index("test", "_na_"), Settings.EMPTY),
            null,
            "test",
            tokenizerSettings
        );

        final Tokenizer subject = tokenizerFactory.create();
        subject.setReader(new StringReader("aaac bbbbc ac"));
        assertTokenStreamContents(subject, new String[] { "aaac", "bbbbc", "ac" });
    }

    // Same determinization concern for the split variant, where the pattern marks
    // separators (whitespace and/or comma runs) rather than the tokens themselves.
    public void testComplexRegexRequiringDeterminizationSplit() throws IOException {
        final Settings tokenizerSettings = Settings.builder().put("pattern", "(\\s+|,+)*").build();
        final SimplePatternSplitTokenizerFactory tokenizerFactory = new SimplePatternSplitTokenizerFactory(
            IndexSettingsModule.newIndexSettings(new Index("test", "_na_"), Settings.EMPTY),
            null,
            "test",
            tokenizerSettings
        );

        final Tokenizer subject = tokenizerFactory.create();
        subject.setReader(new StringReader("word1 word2,,,word3"));
        assertTokenStreamContents(subject, new String[] { "word1", "word2", "word3" });
    }
}
Loading