@@ -13,7 +13,7 @@ import { PriorityQueue } from "@keymanapp/web-utils";
1313import { ContextToken } from "./context-token.js" ;
1414import { CorrectionSearchable , PathResult } from "./correction-searchable.js" ;
1515import { ContextTokenization } from "./context-tokenization.js" ;
16- import { SearchQuotientNode } from "./search-quotient-node.js" ;
16+ import { QuotientNodeFinalizer } from "./quotient-node-finalizer.js" ;
1717import { TokenizationResultMapping } from "./tokenization-result-mapping.js" ;
1818
1919// PathResult needs to be generic:
@@ -31,13 +31,14 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
3131 public readonly tokenization : ContextTokenization ;
3232 private readonly tailCorrectionLength : number
3333
34- private readonly _uncorrectables : SearchQuotientNode [ ] ;
35- private readonly _correctables : SearchQuotientNode [ ] ;
36- private _predictable ?: SearchQuotientNode ;
34+ private readonly _uncorrectables : QuotientNodeFinalizer [ ] ;
35+ private readonly _correctables : QuotientNodeFinalizer [ ] ;
36+ private _predictable ?: QuotientNodeFinalizer ;
3737
38- private selectionQueue : PriorityQueue < SearchQuotientNode > ;
38+ private selectionQueue : PriorityQueue < QuotientNodeFinalizer > ;
3939 private tokenCostMap : Map < number , number > ;
40- private _lockedTokenResults : Map < SearchQuotientNode , TokenResult > ;
40+ private tokenLookupMap : Map < number , ContextToken > ;
41+ private _lockedTokenResults : Map < number , TokenResult > ;
4142 private lastTotalCost : number ;
4243 private handleHasBeenCalled : boolean = false ;
4344
@@ -55,20 +56,20 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
5556 }
5657
5758 get uncorrectableTokens ( ) : ReadonlyArray < ContextToken > {
58- return this . orderedTokens . filter ( ( t ) => this . _uncorrectables . find ( ( c ) => c . spaceId == t . spaceId ) ) ;
59+ return this . _uncorrectables . map ( ( c ) => this . tokenLookupMap . get ( c . spaceId ) ) ;
5960 }
6061
6162 get correctableTokens ( ) : ReadonlyArray < ContextToken > {
62- return this . orderedTokens . filter ( ( t ) => this . _correctables . find ( ( c ) => c . spaceId == t . spaceId ) ) ;
63+ return this . _correctables . map ( ( c ) => this . tokenLookupMap . get ( c . spaceId ) ) ;
6364 }
6465
6566 get predictableToken ( ) : ContextToken {
66- return this . orderedTokens . find ( ( t ) => this . _predictable ?. spaceId == t . spaceId ) ;
67+ return this . tokenLookupMap . get ( this . _predictable ?. spaceId ) ;
6768 }
6869
6970 get lockedTokenResults ( ) : ReadonlyMap < ContextToken , TokenResult > {
7071 return new Map ( [ ...this . _lockedTokenResults . entries ( ) ]
71- . map ( ( tuple ) => [ this . orderedTokens . find ( ( t ) => t . searchModule == tuple [ 0 ] ) , tuple [ 1 ] ] ) ) ;
72+ . map ( ( tuple ) => [ this . tokenLookupMap . get ( tuple [ 0 ] ) , tuple [ 1 ] ] ) ) ;
7273 }
7374
7475 // Will have actual result sequences.
@@ -96,13 +97,18 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
9697 throw new Error ( `Length for correction near tail may not be 0.` ) ;
9798 }
9899
99- const correctables = this . orderedTokens ;
100+ const orderedTokens = this . orderedTokens ;
100101
101102 this . _uncorrectables = [ ] ;
102103 this . _correctables = [ ] ;
103104
104- correctables . forEach ( ( token , index ) => {
105- const searchModule = token . searchModule ;
105+ this . tokenLookupMap = new Map ( ) ;
106+
107+ orderedTokens . forEach ( ( token , index ) => {
108+ // New issue: this mangles the space IDs! We almost certainly need some
109+ // sort of proper map to the source token.
110+ const searchModule = new QuotientNodeFinalizer ( token . searchModule , index == orderedTokens . length - 1 ) ;
111+ this . tokenLookupMap . set ( searchModule . spaceId , token ) ;
106112 if ( ! filterClosure ( token ) ) {
107113 this . _uncorrectables . push ( searchModule ) ;
108114 } else if ( index == tailCorrectionLength - 1 ) {
@@ -116,7 +122,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
116122 const uncorrectables = this . _uncorrectables ;
117123 uncorrectables . forEach ( ( uncorrectable ) => {
118124 const lockedResult = uncorrectable . bestExample ;
119- this . _lockedTokenResults . set ( uncorrectable , {
125+ this . _lockedTokenResults . set ( uncorrectable . spaceId , {
120126 matchString : lockedResult . text ,
121127 inputSamplingCost : 0 ,
122128 knownCost : - Math . log ( lockedResult . p ) ,
@@ -127,7 +133,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
127133 let totalCost = uncorrectables . reduce ( ( accum , curr ) => accum - Math . log ( curr . bestExample . p ) , 0 ) ;
128134 const tokenCostMap = this . tokenCostMap = new Map < number , number > ( ) ;
129135
130- const correctablesToQueue = this . _correctables . concat ( this . predictableToken ?. searchModule ?? [ ] ) ;
136+ const correctablesToQueue = this . _correctables . concat ( this . _predictable ?? [ ] ) ;
131137 correctablesToQueue . forEach ( ( t ) => {
132138 totalCost += t . currentCost ;
133139 tokenCostMap . set ( t . spaceId , t . currentCost ) ;
@@ -137,7 +143,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
137143
138144 // Compute a weighting for each token's search space based the increase in
139145 // tokenization cost that it represents.
140- const tokenUpdateCost = ( searchModule : SearchQuotientNode ) => searchModule . currentCost - ( tokenCostMap . get ( searchModule . spaceId ) ?? 0 )
146+ const tokenUpdateCost = ( searchModule : QuotientNodeFinalizer ) => searchModule . currentCost - ( tokenCostMap . get ( searchModule . spaceId ) ?? 0 )
141147 this . selectionQueue = new PriorityQueue ( ( a , b ) => {
142148 const aUpdateCost = tokenUpdateCost ( a ) ;
143149 const bUpdateCost = tokenUpdateCost ( b ) ;
@@ -150,12 +156,15 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
150156 this . selectionQueue . enqueueAll ( correctablesToQueue ) ;
151157 }
152158
153- private getUpdatedTotalCost ( updatedCorrectable : SearchQuotientNode , tokenCost : number ) : number {
159+ private getUpdatedTotalCost ( updatedCorrectable : QuotientNodeFinalizer , tokenCost : number ) : number {
154160 return this . lastTotalCost + tokenCost - ( this . tokenCostMap . get ( updatedCorrectable . spaceId ) ?? 0 ) ;
155161 }
156162
157163 private collateResults ( ) : TokenizationResultMapping {
158- return new TokenizationResultMapping ( this . orderedTokens . map ( ( t ) => this . _lockedTokenResults . get ( t . searchModule ) ) , this ) ;
164+ // The tokenLookupMap was constructed in the same ordering as the tokens; we can iterate the keys
165+ // or entries to keep everything in order.
166+ const results = [ ...this . tokenLookupMap . keys ( ) ] . map ( ( spaceId ) => this . _lockedTokenResults . get ( spaceId ) )
167+ return new TokenizationResultMapping ( results , this ) ;
159168 }
160169
161170 handleNextNode ( ) : PathResult < TokenizationResultMapping > {
@@ -218,7 +227,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
218227 }
219228
220229 // Either way, update the token -> correction-string map with the obtained result.
221- this . _lockedTokenResults . set ( correctableToUpdate , tokenResult . mapping ) ;
230+ this . _lockedTokenResults . set ( correctableToUpdate . spaceId , tokenResult . mapping ) ;
222231
223232 // If we have a correction for all components in need of correction, allow
224233 // searching for alternative corrections for the 'unbound' token.
0 commit comments