apache · rmuir · Dec 4, 2021 · Nov 29, 2021 · Dec 4, 2021 · Dec 4, 2021
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -7,7 +7,10 @@ http://s.apache.org/luceneversions
 
 API Changes
 ---------------------
-(No changes)
+
+* LUCENE-10010: AutomatonQuery, CompiledAutomaton, and RunAutomaton
+  classes no longer determinize NFAs. Instead it is the responsibility
+  of the caller to determinize.  (Robert Muir)
 
 New Features
 ---------------------

diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
@@ -17,6 +17,14 @@
 
 # Apache Lucene Migration Guide
 
+## Migration from Lucene 9.x to Lucene 10.0
+
+### AutomatonQuery/CompiledAutomaton/RunAutomaton no longer determinize (LUCENE-10010)
+
+These classes no longer take a `determinizeWorkLimit` and no longer determinize
+behind the scenes. It is the responsibility of the caller to to call
+`Operations.determinize()` for DFA execution.
+
 ## Migration from Lucene 9.0 to Lucene 9.1
 
 ### Minor syntactical changes in StandardQueryParser (LUCENE-10223)

diff --git a/...lysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/...lysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
@@ -88,7 +88,7 @@ public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
       throw new IllegalArgumentException("please determinize the incoming automaton first");
     }
 
-    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+    runDFA = new CharacterRunAutomaton(dfa);
   }
 
   private void fillToken(int offsetStart) {

diff --git a/...e/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/...e/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
@@ -98,7 +98,7 @@ public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
       throw new IllegalArgumentException("please determinize the incoming automaton first");
     }
 
-    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+    runDFA = new CharacterRunAutomaton(dfa);
   }
 
   @Override

diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
@@ -25,7 +25,6 @@
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.Operations;
 
 /**
  * A {@link Query} that will match terms against a finite-state machine.
@@ -35,10 +34,9 @@
  * Alternatively, it can be created from a regular expression with {@link RegexpQuery} or from the
  * standard Lucene wildcard syntax with {@link WildcardQuery}.
  *
- * <p>When the query is executed, it will create an equivalent DFA of the finite-state machine, and
- * will enumerate the term dictionary in an intelligent way to reduce the number of comparisons. For
- * example: the regular expression of <code>[dl]og?</code> will make approximately four comparisons:
- * do, dog, lo, and log.
+ * <p>When the query is executed, it will will enumerate the term dictionary in an intelligent way
+ * to reduce the number of comparisons. For example: the regular expression of <code>[dl]og?</code>
+ * will make approximately four comparisons: do, dog, lo, and log.
  *
  * @lucene.experimental
  */
@@ -63,9 +61,10 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
    * @param term Term containing field and possibly some pattern structure. The term text is
    *     ignored.
    * @param automaton Automaton to run, terms that are accepted are considered a match.
+   * @throws IllegalArgumentException if automaton is not determinized
    */
   public AutomatonQuery(final Term term, Automaton automaton) {
-    this(term, automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+    this(term, automaton, false);
   }
 
   /**
@@ -74,34 +73,17 @@ public AutomatonQuery(final Term term, Automaton automaton) {
    * @param term Term containing field and possibly some pattern structure. The term text is
    *     ignored.
    * @param automaton Automaton to run, terms that are accepted are considered a match.
-   * @param determinizeWorkLimit maximum effort to spend determinizing the automaton. If the
-   *     automaton would need more than this much effort, TooComplexToDeterminizeException is
-   *     thrown. Higher numbers require more space but can process more complex automata.
-   */
-  public AutomatonQuery(final Term term, Automaton automaton, int determinizeWorkLimit) {
-    this(term, automaton, determinizeWorkLimit, false);
-  }
-
-  /**
-   * Create a new AutomatonQuery from an {@link Automaton}.
-   *
-   * @param term Term containing field and possibly some pattern structure. The term text is
-   *     ignored.
-   * @param automaton Automaton to run, terms that are accepted are considered a match.
-   * @param determinizeWorkLimit maximum effort to spend determinizing the automaton. If the
-   *     automaton will need more than this much effort, TooComplexToDeterminizeException is thrown.
-   *     Higher numbers require more space but can process more complex automata.
    * @param isBinary if true, this automaton is already binary and will not go through the
    *     UTF32ToUTF8 conversion
+   * @throws IllegalArgumentException if automaton is not determinized
    */
-  public AutomatonQuery(
-      final Term term, Automaton automaton, int determinizeWorkLimit, boolean isBinary) {
+  public AutomatonQuery(final Term term, Automaton automaton, boolean isBinary) {
     super(term.field());
     this.term = term;
     this.automaton = automaton;
     this.automatonIsBinary = isBinary;
     // TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
-    this.compiled = new CompiledAutomaton(automaton, null, true, determinizeWorkLimit, isBinary);
+    this.compiled = new CompiledAutomaton(automaton, null, true, isBinary);
 
     this.ramBytesUsed =
         BASE_RAM_BYTES + term.ramBytesUsed() + automaton.ramBytesUsed() + compiled.ramBytesUsed();

diff --git a/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
@@ -30,9 +30,7 @@ public class PrefixQuery extends AutomatonQuery {
 
   /** Constructs a query for terms starting with <code>prefix</code>. */
   public PrefixQuery(Term prefix) {
-    // It's OK to pass unlimited determinizeWorkLimit: the automaton is born small and
-    // determinized:
-    super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
+    super(prefix, toAutomaton(prefix.bytes()), true);
   }
 
   /** Build an automaton accepting all terms with the specified prefix. */

diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
@@ -139,9 +139,10 @@ public RegexpQuery(
       int determinizeWorkLimit) {
     super(
         term,
-        new RegExp(term.text(), syntax_flags, match_flags)
-            .toAutomaton(provider, determinizeWorkLimit),
-        determinizeWorkLimit);
+        Operations.determinize(
+            new RegExp(term.text(), syntax_flags, match_flags)
+                .toAutomaton(provider, determinizeWorkLimit),
+            determinizeWorkLimit));
   }
 
   /** Returns the regexp of this query wrapped in a Term. */

diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -140,13 +140,17 @@ public void visit(QueryVisitor visitor) {
     }
   }
 
+  // TODO: this is extremely slow. we should not be doing this.
   private ByteRunAutomaton asByteRunAutomaton() {
     TermIterator iterator = termData.iterator();
     List<Automaton> automata = new ArrayList<>();
     for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
       automata.add(Automata.makeBinary(term));
     }
-    return new CompiledAutomaton(Operations.union(automata)).runAutomaton;
+    Automaton automaton =
+        Operations.determinize(
+            Operations.union(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+    return new CompiledAutomaton(automaton).runAutomaton;
   }
 
   @Override

diff --git a/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java
@@ -63,7 +63,6 @@ public TermRangeQuery(
     super(
         new Term(field, lowerTerm),
         toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper),
-        Integer.MAX_VALUE,
         true);
     this.lowerTerm = lowerTerm;
     this.upperTerm = upperTerm;

diff --git a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
@@ -47,7 +47,7 @@ public class WildcardQuery extends AutomatonQuery {
 
   /** Constructs a query for terms matching <code>term</code>. */
   public WildcardQuery(Term term) {
-    super(term, toAutomaton(term));
+    this(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
   }
 
   /**
@@ -59,7 +59,7 @@ public WildcardQuery(Term term) {
    *     otherwise know what to specify.
    */
   public WildcardQuery(Term term, int determinizeWorkLimit) {
-    super(term, toAutomaton(term), determinizeWorkLimit);
+    super(term, toAutomaton(term, determinizeWorkLimit));
   }
 
   /**
@@ -68,7 +68,7 @@ public WildcardQuery(Term term, int determinizeWorkLimit) {
    * @lucene.internal
    */
   @SuppressWarnings("fallthrough")
-  public static Automaton toAutomaton(Term wildcardquery) {
+  public static Automaton toAutomaton(Term wildcardquery, int determinizeWorkLimit) {
     List<Automaton> automata = new ArrayList<>();
 
     String wildcardText = wildcardquery.text();
@@ -97,7 +97,7 @@ public static Automaton toAutomaton(Term wildcardquery) {
       i += length;
     }
 
-    return Operations.concatenate(automata);
+    return Operations.determinize(Operations.concatenate(automata), determinizeWorkLimit);
   }
 
   /** Returns the pattern term. */

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
@@ -19,14 +19,30 @@
 /** Automaton representation for matching UTF-8 byte[]. */
 public class ByteRunAutomaton extends RunAutomaton {
 
-  /** Converts incoming automaton to byte-based (UTF32ToUTF8) first */
+  /**
+   * Converts incoming automaton to byte-based (UTF32ToUTF8) first
+   *
+   * @throws IllegalArgumentException if the automaton is not deterministic
+   */
   public ByteRunAutomaton(Automaton a) {
-    this(a, false, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+    this(a, false);
   }
 
-  /** expert: if isBinary is true, the input is already byte-based */
-  public ByteRunAutomaton(Automaton a, boolean isBinary, int determinizeWorkLimit) {
-    super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, determinizeWorkLimit);
+  /**
+   * expert: if isBinary is true, the input is already byte-based
+   *
+   * @throws IllegalArgumentException if the automaton is not deterministic
+   */
+  public ByteRunAutomaton(Automaton a, boolean isBinary) {
+    super(isBinary ? a : convert(a), 256);
+  }
+
+  static Automaton convert(Automaton a) {
+    if (!a.isDeterministic()) {
+      throw new IllegalArgumentException("Automaton must be deterministic");
+    }
+    // we checked the input is a DFA, according to mike this determinization is contained :)
+    return Operations.determinize(new UTF32ToUTF8().convert(a), Integer.MAX_VALUE);
   }
 
   /** Returns true if the given byte array is accepted by this automaton */

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
@@ -18,22 +18,13 @@
 
 /** Automaton representation for matching char[]. */
 public class CharacterRunAutomaton extends RunAutomaton {
-  /** Construct with a default number of determinizeWorkLimit. */
-  public CharacterRunAutomaton(Automaton a) {
-    this(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-  }
-
   /**
-   * Constructor specifying determinizeWorkLimit.
+   * Construct from a DFA
    *
-   * @param a Automaton to match
-   * @param determinizeWorkLimit maximum effort to spend determinizing the automataon. If more
-   *     effort is required then a TooComplexToDeterminizeException is thrown. Use {@link
-   *     Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't otherwise know
-   *     what to specify.
+   * @throws IllegalArgumentException if the automaton is not deterministic
    */
-  public CharacterRunAutomaton(Automaton a, int determinizeWorkLimit) {
-    super(a, Character.MAX_CODE_POINT + 1, determinizeWorkLimit);
+  public CharacterRunAutomaton(Automaton a) {
+    super(a, Character.MAX_CODE_POINT + 1);
   }
 
   /** Returns true if the given string is accepted by this automaton. */

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -34,8 +34,8 @@
 import org.apache.lucene.util.UnicodeUtil;
 
 /**
- * Immutable class holding compiled details for a given Automaton. The Automaton is deterministic,
- * must not have dead states but is not necessarily minimal.
+ * Immutable class holding compiled details for a given Automaton. The Automaton must be
+ * deterministic, must not have dead states but is not necessarily minimal.
  *
  * @lucene.experimental
  */
@@ -131,24 +131,27 @@ private static int findSinkState(Automaton automaton) {
    * Create this. If finite is null, we use {@link Operations#isFinite} to determine whether it is
    * finite. If simplify is true, we run possibly expensive operations to determine if the automaton
    * is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}.
+   *
+   * @throws IllegalArgumentException if the automaton is not deterministic
    */
   public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
-    this(automaton, finite, simplify, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, false);
+    this(automaton, finite, simplify, false);
   }
 
   /**
    * Create this. If finite is null, we use {@link Operations#isFinite} to determine whether it is
    * finite. If simplify is true, we run possibly expensive operations to determine if the automaton
-   * is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. If simplify requires
-   * determinizing the automaton then at most determinizeWorkLimit effort will be spent. Any more
-   * than that will cause a TooComplexToDeterminizeException.
+   * is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}.
+   *
+   * @throws IllegalArgumentException if the automaton is not deterministic
    */
   public CompiledAutomaton(
-      Automaton automaton,
-      Boolean finite,
-      boolean simplify,
-      int determinizeWorkLimit,
-      boolean isBinary) {
+      Automaton automaton, Boolean finite, boolean simplify, boolean isBinary) {
+    // require DFA for input, as this class currently can't handle NFAs.
+    if (!automaton.isDeterministic()) {
+      throw new IllegalArgumentException("Automaton must be deterministic");
+    }
+
     if (automaton.getNumStates() == 0) {
       automaton = new Automaton();
       automaton.createState();
@@ -193,8 +196,6 @@ public CompiledAutomaton(
         return;
       }
 
-      automaton = Operations.determinize(automaton, determinizeWorkLimit);
-
       IntsRef singleton = Operations.getSingleton(automaton);
 
       if (singleton != null) {
@@ -250,8 +251,9 @@ public CompiledAutomaton(
       }
     }
 
-    // This will determinize the binary automaton for us:
-    runAutomaton = new ByteRunAutomaton(binary, true, determinizeWorkLimit);
+    // We already had a DFA (or threw exception), according to mike UTF32toUTF8 won't "blow up"
+    binary = Operations.determinize(binary, Integer.MAX_VALUE);
+    runAutomaton = new ByteRunAutomaton(binary, true);
 
     this.automaton = runAutomaton.automaton;
 

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
@@ -56,20 +56,13 @@ public abstract class RunAutomaton implements Accountable {
    * Constructs a new <code>RunAutomaton</code> from a deterministic <code>Automaton</code>.
    *
    * @param a an automaton
+   * @throws IllegalArgumentException if the automaton is not deterministic
    */
   protected RunAutomaton(Automaton a, int alphabetSize) {
-    this(a, alphabetSize, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-  }
-
-  /**
-   * Constructs a new <code>RunAutomaton</code> from a deterministic <code>Automaton</code>.
-   *
-   * @param a an automaton
-   * @param determinizeWorkLimit maximum effort to spend while determinizing
-   */
-  protected RunAutomaton(Automaton a, int alphabetSize, int determinizeWorkLimit) {
     this.alphabetSize = alphabetSize;
-    a = Operations.determinize(a, determinizeWorkLimit);
+    if (!a.isDeterministic()) {
+      throw new IllegalArgumentException("Automaton must be deterministic");
+    }
     this.automaton = a;
     points = a.getStartPoints();
     size = Math.max(1, a.getNumStates());

diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
@@ -280,7 +280,7 @@ public void testIntersectRandom() throws IOException {
         a = Automata.makeStringUnion(sortedAcceptTerms);
       }
 
-      final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
+      final CompiledAutomaton c = new CompiledAutomaton(a, true, false, false);
 
       final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
       final Set<BytesRef> acceptTermsSet = new HashSet<>();

diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
@@ -105,7 +105,7 @@ public void testFiniteVersusInfinite() throws Exception {
       // AutomatonTestUtil.minimizeSimple(alternate);
       // System.out.println("minimize done");
       AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
-      AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate, Integer.MAX_VALUE);
+      AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate);
 
       ScoreDoc[] origHits = searcher.search(a1, 25).scoreDocs;
       ScoreDoc[] newHits = searcher.search(a2, 25).scoreDocs;