Skip to content

Commit cf60a5e

Browse files
committed
change(web): use QuotientNodeFinalizer within TokenizationCorrector
1 parent 86e479c commit cf60a5e

2 files changed

Lines changed: 71 additions & 19 deletions

File tree

web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import { PriorityQueue } from "@keymanapp/web-utils";
1313
import { ContextToken } from "./context-token.js";
1414
import { CorrectionSearchable, PathResult } from "./correction-searchable.js";
1515
import { ContextTokenization } from "./context-tokenization.js";
16-
import { SearchQuotientNode } from "./search-quotient-node.js";
16+
import { QuotientNodeFinalizer } from "./quotient-node-finalizer.js";
1717
import { TokenizationResultMapping } from "./tokenization-result-mapping.js";
1818

1919
// PathResult needs to be generic:
@@ -31,13 +31,14 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
3131
public readonly tokenization: ContextTokenization;
3232
private readonly tailCorrectionLength: number
3333

34-
private readonly _uncorrectables: SearchQuotientNode[];
35-
private readonly _correctables: SearchQuotientNode[];
36-
private _predictable?: SearchQuotientNode;
34+
private readonly _uncorrectables: QuotientNodeFinalizer[];
35+
private readonly _correctables: QuotientNodeFinalizer[];
36+
private _predictable?: QuotientNodeFinalizer;
3737

38-
private selectionQueue: PriorityQueue<SearchQuotientNode>;
38+
private selectionQueue: PriorityQueue<QuotientNodeFinalizer>;
3939
private tokenCostMap: Map<number, number>;
40-
private _lockedTokenResults: Map<SearchQuotientNode, TokenResult>;
40+
private tokenLookupMap: Map<number, ContextToken>;
41+
private _lockedTokenResults: Map<number, TokenResult>;
4142
private lastTotalCost: number;
4243
private handleHasBeenCalled: boolean = false;
4344

@@ -55,20 +56,20 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
5556
}
5657

5758
get uncorrectableTokens(): ReadonlyArray<ContextToken> {
58-
return this.orderedTokens.filter((t) => this._uncorrectables.find((c) => c.spaceId == t.spaceId));
59+
return this._uncorrectables.map((c) => this.tokenLookupMap.get(c.spaceId));
5960
}
6061

6162
get correctableTokens(): ReadonlyArray<ContextToken> {
62-
return this.orderedTokens.filter((t) => this._correctables.find((c) => c.spaceId == t.spaceId));
63+
return this._correctables.map((c) => this.tokenLookupMap.get(c.spaceId));
6364
}
6465

6566
get predictableToken(): ContextToken {
66-
return this.orderedTokens.find((t) => this._predictable?.spaceId == t.spaceId);
67+
return this.tokenLookupMap.get(this._predictable?.spaceId);
6768
}
6869

6970
get lockedTokenResults(): ReadonlyMap<ContextToken, TokenResult> {
7071
return new Map([...this._lockedTokenResults.entries()]
71-
.map((tuple) => [this.orderedTokens.find((t) => t.searchModule == tuple[0]), tuple[1]]));
72+
.map((tuple) => [this.tokenLookupMap.get(tuple[0]), tuple[1]]));
7273
}
7374

7475
// Will have actual result sequences.
@@ -96,13 +97,18 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
9697
throw new Error(`Length for correction near tail may not be 0.`);
9798
}
9899

99-
const correctables = this.orderedTokens;
100+
const orderedTokens = this.orderedTokens;
100101

101102
this._uncorrectables = [];
102103
this._correctables = [];
103104

104-
correctables.forEach((token, index) => {
105-
const searchModule = token.searchModule;
105+
this.tokenLookupMap = new Map();
106+
107+
orderedTokens.forEach((token, index) => {
108+
// New issue: this mangles the space IDs! We almost certainly need some
109+
// sort of proper map to the source token.
110+
const searchModule = new QuotientNodeFinalizer(token.searchModule, index == orderedTokens.length - 1);
111+
this.tokenLookupMap.set(searchModule.spaceId, token);
106112
if(!filterClosure(token)) {
107113
this._uncorrectables.push(searchModule);
108114
} else if(index == tailCorrectionLength - 1) {
@@ -116,7 +122,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
116122
const uncorrectables = this._uncorrectables;
117123
uncorrectables.forEach((uncorrectable) => {
118124
const lockedResult = uncorrectable.bestExample;
119-
this._lockedTokenResults.set(uncorrectable, {
125+
this._lockedTokenResults.set(uncorrectable.spaceId, {
120126
matchString: lockedResult.text,
121127
inputSamplingCost: 0,
122128
knownCost: -Math.log(lockedResult.p),
@@ -127,7 +133,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
127133
let totalCost = uncorrectables.reduce((accum, curr) => accum - Math.log(curr.bestExample.p), 0);
128134
const tokenCostMap = this.tokenCostMap = new Map<number, number>();
129135

130-
const correctablesToQueue = this._correctables.concat(this.predictableToken?.searchModule ?? []);
136+
const correctablesToQueue = this._correctables.concat(this._predictable ?? []);
131137
correctablesToQueue.forEach((t) => {
132138
totalCost += t.currentCost;
133139
tokenCostMap.set(t.spaceId, t.currentCost);
@@ -137,7 +143,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
137143

138144
// Compute a weighting for each token's search space based on the increase in
139145
// tokenization cost that it represents.
140-
const tokenUpdateCost = (searchModule: SearchQuotientNode) => searchModule.currentCost - (tokenCostMap.get(searchModule.spaceId) ?? 0)
146+
const tokenUpdateCost = (searchModule: QuotientNodeFinalizer) => searchModule.currentCost - (tokenCostMap.get(searchModule.spaceId) ?? 0)
141147
this.selectionQueue = new PriorityQueue((a, b) => {
142148
const aUpdateCost = tokenUpdateCost(a);
143149
const bUpdateCost = tokenUpdateCost(b);
@@ -150,12 +156,15 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
150156
this.selectionQueue.enqueueAll(correctablesToQueue);
151157
}
152158

153-
private getUpdatedTotalCost(updatedCorrectable: SearchQuotientNode, tokenCost: number): number {
159+
private getUpdatedTotalCost(updatedCorrectable: QuotientNodeFinalizer, tokenCost: number): number {
154160
return this.lastTotalCost + tokenCost - (this.tokenCostMap.get(updatedCorrectable.spaceId) ?? 0);
155161
}
156162

157163
private collateResults(): TokenizationResultMapping {
158-
return new TokenizationResultMapping(this.orderedTokens.map((t) => this._lockedTokenResults.get(t.searchModule)), this);
164+
// The tokenLookupMap was constructed in the same ordering as the tokens; we can iterate the keys
165+
// or entries to keep everything in order.
166+
const results = [...this.tokenLookupMap.keys()].map((spaceId) => this._lockedTokenResults.get(spaceId))
167+
return new TokenizationResultMapping(results, this);
159168
}
160169

161170
handleNextNode(): PathResult<TokenizationResultMapping> {
@@ -218,7 +227,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
218227
}
219228

220229
// Either way, update the token -> correction-string map with the obtained result.
221-
this._lockedTokenResults.set(correctableToUpdate, tokenResult.mapping);
230+
this._lockedTokenResults.set(correctableToUpdate.spaceId, tokenResult.mapping);
222231

223232
// If we have a correction for all components in need of correction, allow
224233
// searching for alternative corrections for the 'unbound' token.

web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'
1717
import {
1818
ContextToken,
1919
ContextTokenization,
20+
correction,
2021
correctionValidForAutoSelect,
2122
generateSubsetId,
23+
getBestMatches,
2224
LegacyQuotientSpur,
2325
models,
2426
PathInputProperties,
@@ -41,6 +43,10 @@ const plainModel = new TrieModel(
4143
}
4244
);
4345

46+
function buildTestTimer() {
47+
return new correction.ExecutionTimer(Number.MAX_VALUE, Number.MAX_VALUE);
48+
}
49+
4450
function buildFixture_therefore() {
4551
let ID_SEED = 11;
4652

@@ -420,5 +426,42 @@ describe('TokenizationCorrector', () => {
420426
const nilResult = instance.handleNextNode();
421427
assert.equal(nilResult.type, 'none');
422428
});
429+
430+
describe('with getBestMatches()', () => {
431+
it('finds results from each of two tokenization variants sharing lower-level SearchQuotientNodes', async () => {
432+
const fixture = buildFixture_therefore();
433+
434+
// Issue: `theref` is built completely off of the multi-token's `the` -
435+
// and so starting on `the` later will fail! It "already produced" the
436+
// result, after all.
437+
//
438+
// ... which is ANOTHER benefit to a prospective TerminalQuotientNode.
439+
// It always gets a chance to process it!
440+
const tokenizations = [fixture.theref, fixture.the_ef];
441+
const correctors = tokenizations.map((t) => new TokenizationCorrector(t, t.tokens.length, fixture.filter));
442+
443+
let haveSeenSingleTokenCorrection = false;
444+
let haveSeenThreeTokenCorrection = false;
445+
for await(let phraseMatch of getBestMatches<
446+
ReadonlyArray<TokenResult>,
447+
TokenizationResultMapping,
448+
TokenizationCorrector
449+
>(correctors, buildTestTimer())) {
450+
451+
if(phraseMatch.matchedResult.length == 1) {
452+
haveSeenSingleTokenCorrection = true;
453+
} else if(phraseMatch.matchedResult.length == 3) {
454+
haveSeenThreeTokenCorrection = true;
455+
}
456+
457+
if(haveSeenSingleTokenCorrection && haveSeenThreeTokenCorrection) {
458+
break;
459+
}
460+
}
461+
462+
assert.isTrue(haveSeenSingleTokenCorrection, 'A single-token correction was expected but not found');
463+
assert.isTrue(haveSeenThreeTokenCorrection, 'A three-token correction was expected but not found');
464+
});
465+
});
423466
});
424467
});

0 commit comments

Comments
 (0)