Skip to content

Commit cf60a5e

Browse files
committed
change(web): use QuotientNodeFinalizer within TokenizationCorrector
1 parent 86e479c commit cf60a5e

2 files changed

Lines changed: 71 additions & 19 deletions

File tree

web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import { PriorityQueue } from "@keymanapp/web-utils";
1313
import { ContextToken } from "./context-token.js";
1414
import { CorrectionSearchable, PathResult } from "./correction-searchable.js";
1515
import { ContextTokenization } from "./context-tokenization.js";
16-
import { SearchQuotientNode } from "./search-quotient-node.js";
16+
import { QuotientNodeFinalizer } from "./quotient-node-finalizer.js";
1717
import { TokenizationResultMapping } from "./tokenization-result-mapping.js";
1818

1919
// PathResult needs to be generic:
@@ -31,13 +31,14 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
3131
public readonly tokenization: ContextTokenization;
3232
private readonly tailCorrectionLength: number
3333

34-
private readonly _uncorrectables: SearchQuotientNode[];
35-
private readonly _correctables: SearchQuotientNode[];
36-
private _predictable?: SearchQuotientNode;
34+
private readonly _uncorrectables: QuotientNodeFinalizer[];
35+
private readonly _correctables: QuotientNodeFinalizer[];
36+
private _predictable?: QuotientNodeFinalizer;
3737

38-
private selectionQueue: PriorityQueue<SearchQuotientNode>;
38+
private selectionQueue: PriorityQueue<QuotientNodeFinalizer>;
3939
private tokenCostMap: Map<number, number>;
40-
private _lockedTokenResults: Map<SearchQuotientNode, TokenResult>;
40+
private tokenLookupMap: Map<number, ContextToken>;
41+
private _lockedTokenResults: Map<number, TokenResult>;
4142
private lastTotalCost: number;
4243
private handleHasBeenCalled: boolean = false;
4344

@@ -55,20 +56,20 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
5556
}
5657

5758
get uncorrectableTokens(): ReadonlyArray<ContextToken> {
58-
return this.orderedTokens.filter((t) => this._uncorrectables.find((c) => c.spaceId == t.spaceId));
59+
return this._uncorrectables.map((c) => this.tokenLookupMap.get(c.spaceId));
5960
}
6061

6162
get correctableTokens(): ReadonlyArray<ContextToken> {
62-
return this.orderedTokens.filter((t) => this._correctables.find((c) => c.spaceId == t.spaceId));
63+
return this._correctables.map((c) => this.tokenLookupMap.get(c.spaceId));
6364
}
6465

6566
get predictableToken(): ContextToken {
66-
return this.orderedTokens.find((t) => this._predictable?.spaceId == t.spaceId);
67+
return this.tokenLookupMap.get(this._predictable?.spaceId);
6768
}
6869

6970
get lockedTokenResults(): ReadonlyMap<ContextToken, TokenResult> {
7071
return new Map([...this._lockedTokenResults.entries()]
71-
.map((tuple) => [this.orderedTokens.find((t) => t.searchModule == tuple[0]), tuple[1]]));
72+
.map((tuple) => [this.tokenLookupMap.get(tuple[0]), tuple[1]]));
7273
}
7374

7475
// Will have actual result sequences.
@@ -96,13 +97,18 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
9697
throw new Error(`Length for correction near tail may not be 0.`);
9798
}
9899

99-
const correctables = this.orderedTokens;
100+
const orderedTokens = this.orderedTokens;
100101

101102
this._uncorrectables = [];
102103
this._correctables = [];
103104

104-
correctables.forEach((token, index) => {
105-
const searchModule = token.searchModule;
105+
this.tokenLookupMap = new Map();
106+
107+
orderedTokens.forEach((token, index) => {
108+
// New issue: this mangles the space IDs! We almost certainly need some
109+
// sort of proper map to the source token.
110+
const searchModule = new QuotientNodeFinalizer(token.searchModule, index == orderedTokens.length - 1);
111+
this.tokenLookupMap.set(searchModule.spaceId, token);
106112
if(!filterClosure(token)) {
107113
this._uncorrectables.push(searchModule);
108114
} else if(index == tailCorrectionLength - 1) {
@@ -116,7 +122,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
116122
const uncorrectables = this._uncorrectables;
117123
uncorrectables.forEach((uncorrectable) => {
118124
const lockedResult = uncorrectable.bestExample;
119-
this._lockedTokenResults.set(uncorrectable, {
125+
this._lockedTokenResults.set(uncorrectable.spaceId, {
120126
matchString: lockedResult.text,
121127
inputSamplingCost: 0,
122128
knownCost: -Math.log(lockedResult.p),
@@ -127,7 +133,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
127133
let totalCost = uncorrectables.reduce((accum, curr) => accum - Math.log(curr.bestExample.p), 0);
128134
const tokenCostMap = this.tokenCostMap = new Map<number, number>();
129135

130-
const correctablesToQueue = this._correctables.concat(this.predictableToken?.searchModule ?? []);
136+
const correctablesToQueue = this._correctables.concat(this._predictable ?? []);
131137
correctablesToQueue.forEach((t) => {
132138
totalCost += t.currentCost;
133139
tokenCostMap.set(t.spaceId, t.currentCost);
@@ -137,7 +143,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
137143

138144
// Compute a weighting for each token's search space based on the increase in
139145
// tokenization cost that it represents.
140-
const tokenUpdateCost = (searchModule: SearchQuotientNode) => searchModule.currentCost - (tokenCostMap.get(searchModule.spaceId) ?? 0)
146+
const tokenUpdateCost = (searchModule: QuotientNodeFinalizer) => searchModule.currentCost - (tokenCostMap.get(searchModule.spaceId) ?? 0)
141147
this.selectionQueue = new PriorityQueue((a, b) => {
142148
const aUpdateCost = tokenUpdateCost(a);
143149
const bUpdateCost = tokenUpdateCost(b);
@@ -150,12 +156,15 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
150156
this.selectionQueue.enqueueAll(correctablesToQueue);
151157
}
152158

153-
private getUpdatedTotalCost(updatedCorrectable: SearchQuotientNode, tokenCost: number): number {
159+
private getUpdatedTotalCost(updatedCorrectable: QuotientNodeFinalizer, tokenCost: number): number {
154160
return this.lastTotalCost + tokenCost - (this.tokenCostMap.get(updatedCorrectable.spaceId) ?? 0);
155161
}
156162

157163
private collateResults(): TokenizationResultMapping {
158-
return new TokenizationResultMapping(this.orderedTokens.map((t) => this._lockedTokenResults.get(t.searchModule)), this);
164+
// The tokenLookupMap was constructed in the same ordering as the tokens; we can iterate the keys
165+
// or entries to keep everything in order.
166+
const results = [...this.tokenLookupMap.keys()].map((spaceId) => this._lockedTokenResults.get(spaceId))
167+
return new TokenizationResultMapping(results, this);
159168
}
160169

161170
handleNextNode(): PathResult<TokenizationResultMapping> {
@@ -218,7 +227,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
218227
}
219228

220229
// Either way, update the token -> correction-string map with the obtained result.
221-
this._lockedTokenResults.set(correctableToUpdate, tokenResult.mapping);
230+
this._lockedTokenResults.set(correctableToUpdate.spaceId, tokenResult.mapping);
222231

223232
// If we have a correction for all components in need of correction, allow
224233
// searching for alternative corrections for the 'unbound' token.

web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'
1717
import {
1818
ContextToken,
1919
ContextTokenization,
20+
correction,
2021
correctionValidForAutoSelect,
2122
generateSubsetId,
23+
getBestMatches,
2224
LegacyQuotientSpur,
2325
models,
2426
PathInputProperties,
@@ -41,6 +43,10 @@ const plainModel = new TrieModel(
4143
}
4244
);
4345

46+
function buildTestTimer() {
47+
return new correction.ExecutionTimer(Number.MAX_VALUE, Number.MAX_VALUE);
48+
}
49+
4450
function buildFixture_therefore() {
4551
let ID_SEED = 11;
4652

@@ -420,5 +426,42 @@ describe('TokenizationCorrector', () => {
420426
const nilResult = instance.handleNextNode();
421427
assert.equal(nilResult.type, 'none');
422428
});
429+
430+
describe('with getBestMatches()', () => {
431+
it('finds results from each of two tokenization variants sharing lower-level SearchQuotientNodes', async () => {
432+
const fixture = buildFixture_therefore();
433+
434+
// Issue: `theref` is built completely off of the multi-token's `the` -
435+
// and so starting on `the` later will fail! It "already produced" the
436+
// result, after all.
437+
//
438+
// ... which is ANOTHER benefit to a prospective TerminalQuotientNode.
439+
// It always gets a chance to process it!
440+
const tokenizations = [fixture.theref, fixture.the_ef];
441+
const correctors = tokenizations.map((t) => new TokenizationCorrector(t, t.tokens.length, fixture.filter));
442+
443+
let haveSeenSingleTokenCorrection = false;
444+
let haveSeenThreeTokenCorrection = false;
445+
for await(let phraseMatch of getBestMatches<
446+
ReadonlyArray<TokenResult>,
447+
TokenizationResultMapping,
448+
TokenizationCorrector
449+
>(correctors, buildTestTimer())) {
450+
451+
if(phraseMatch.matchedResult.length == 1) {
452+
haveSeenSingleTokenCorrection = true;
453+
} else if(phraseMatch.matchedResult.length == 3) {
454+
haveSeenThreeTokenCorrection = true;
455+
}
456+
457+
if(haveSeenSingleTokenCorrection && haveSeenThreeTokenCorrection) {
458+
break;
459+
}
460+
}
461+
462+
assert.isTrue(haveSeenSingleTokenCorrection, 'A single-token correction was expected but not found');
463+
assert.isTrue(haveSeenThreeTokenCorrection, 'A three-token correction was expected but not found');
464+
});
465+
});
423466
});
424467
});

0 commit comments

Comments
 (0)