From 24d1fc98bf913e4be85e7f8af5ad24c127c0ea7d Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 19 Nov 2025 14:19:53 -0600 Subject: [PATCH 1/2] refactor(web): split tokenization realignment from evaluateTransition With the various ways that tokenizations can transition depending upon which potential inputs are applied, it's possible for multiple different tokenizations to transition into the same one. As such, there will no longer be "just one" way that a tokenization is reached. Accordingly, it's best to perform word-boundary realignment operations (splits, merges) separately from text-editing operations (inserts, deletes). Build-bot: skip build:web Test-bot: skip --- .../src/main/correction/context-state.ts | 7 +- .../main/correction/context-tokenization.ts | 83 +++++++++++++------ .../src/main/correction/context-transition.ts | 21 ++--- .../worker-thread/src/main/predict-helpers.ts | 5 +- .../context/context-tokenization.tests.ts | 58 ++++++------- .../base-context-state.tests.ts | 4 +- 6 files changed, 97 insertions(+), 81 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts index e119ad63c80..ac990defcb8 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts @@ -187,14 +187,15 @@ export class ContextState { * * May also contain a single entry for applying Suggestions or when correction behavior * is disabled. - * @param isApplyingSuggestion When true, alters behavior to better model application of suggestions. + * @param appliedSuggestionId When defined, notes the original transition ID corresponding to + * the applied suggestion. 
* @returns */ analyzeTransition( context: Context, transformDistribution: Distribution, // overrides checks for token substitution that can fail for large applied suggestions. - isApplyingSuggestion?: boolean + appliedSuggestionId?: number ): ContextTransition { const lexicalModel = this.model; @@ -249,7 +250,7 @@ export class ContextState { // into subsets. const bestProb = transformDistribution.reduce((best, curr) => Math.max(best, curr.p), 0); // Should gain one per subsetBuilder.subsets entry. - const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, lexicalModel, trueInput, bestProb); + const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb, appliedSuggestionId); // ------------ diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index 0fbc3494a6a..b97d7328a4f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -105,11 +105,15 @@ export class ContextTokenization { * The sequence of tokens in the context represented by this instance. */ readonly tokens: ContextToken[]; + /** - * The tokenization-transition metadata relating this instance to the most likely - * tokenization from a prior state. + * Denotes whether or not the transition to this tokenization added or deleted + * any tokens. 
*/ - readonly transitionEdits?: TransitionEdge; + readonly transitionEdits?: { + addedNewTokens: boolean, + removedOldTokens: boolean + }; /** * The portion of edits from the true input keystroke that are not part of the @@ -129,13 +133,18 @@ export class ContextTokenization { constructor(tokens: ContextToken[], alignment: TransitionEdge, taillessTrueKeystroke: Transform); constructor( param1: ContextToken[] | ContextTokenization, - alignment?: TransitionEdge, + tokenizationPath?: TransitionEdge, taillessTrueKeystroke?: Transform ) { if(!(param1 instanceof ContextTokenization)) { const tokens = param1; this.tokens = [].concat(tokens); - this.transitionEdits = alignment; + if(tokenizationPath) { + this.transitionEdits = { + addedNewTokens: tokenizationPath?.inputs[0].sample.has(1) ?? false, + removedOldTokens: (tokenizationPath?.alignment.removedTokenCount ?? 0) > 0 + } + } this.taillessTrueKeystroke = taillessTrueKeystroke; } else { const priorToClone = param1; @@ -489,30 +498,16 @@ export class ContextTokenization { /** * Given results from `precomputeTokenizationAfterInput`, this method will - * evaluate the pending transition in tokenization for all associated inputs + * realign this tokenization's range to match the incoming keystroke's context window * while reusing as many correction-search intermediate results as possible. - * @param transitionEdge Batched results from one or more + * @param alignment Batched results from one or more * `precomputeTokenizationAfterInput` calls on this instance, all with the * same alignment values. - * @param lexicalModel The active lexical model - * @param sourceInput The Transform associated with the keystroke triggering - * the transition. - * @param bestProbFromSet The probability of the single most likely input - * transform in the overall transformDistribution associated with the - * keystroke triggering the transition. It need not be represented by the - * TransitionEdge to be built. 
 * @returns */ - evaluateTransition( - transitionEdge: TransitionEdge, - lexicalModel: LexicalModel, - sourceInput: Transform, - bestProbFromSet: number - ): ContextTokenization { - const { alignment: alignment, inputs } = transitionEdge; + realign(alignment: TransitionEdgeAlignment): ContextTokenization { const sliceIndex = alignment.edgeWindow.sliceIndex; const baseTokenization = this.tokens.slice(sliceIndex); - let affectedToken: ContextToken; const tokenization: ContextToken[] = []; @@ -553,6 +548,40 @@ export class ContextTokenization { tokenization.push(token); } + return new ContextTokenization(tokenization, null, this.taillessTrueKeystroke); + } + + /** + * Given results from `precomputeTokenizationAfterInput`, this method will + * evaluate the pending transition in tokenization for all associated inputs + * while reusing as many correction-search intermediate results as possible. + * @param transitionEdge Batched results from one or more + * `precomputeTokenizationAfterInput` calls on this instance, all with the + * same alignment values. + * @param transitionId The id of the Transform associated with the keystroke + * triggering the transition. + * @param bestProbFromSet The probability of the single most likely input + * transform in the overall transformDistribution associated with the + * keystroke triggering the transition. It need not be represented by the + * transitionEdge to be built. + * @param appliedSuggestionId When defined, tags the affected token with the applied suggestion's transition ID. + * @returns + */ + evaluateTransition( + transitionEdge: TransitionEdge, + transitionId: number, + bestProbFromSet: number, + appliedSuggestionId?: number + ): ContextTokenization { + const { alignment, inputs } = transitionEdge; + const sliceIndex = alignment.edgeWindow.sliceIndex; + const lexicalModel = this.tail.searchModule.model; + + let affectedToken: ContextToken; + + const realignedTokenization = this.realign(alignment); + const tokenization = realignedTokenization.tokens; + + // Assumption: inputs.length > 0.
(There is at least one input transform.) const inputTransformKeys = [...inputs[0].sample.keys()]; let removedTokenCount = alignment.removedTokenCount; @@ -579,7 +608,11 @@ export class ContextTokenization { } affectedToken.isPartial = true; - delete affectedToken.appliedTransitionId; + if(appliedSuggestionId !== undefined) { + affectedToken.appliedTransitionId = appliedSuggestionId; + } else { + delete affectedToken.appliedTransitionId; + } // If we are completely replacing a token via delete left, erase the deleteLeft; // that part applied to a _previous_ token that no longer exists. @@ -590,7 +623,7 @@ export class ContextTokenization { const inputSource: PathInputProperties = { segment: { - transitionId: sourceInput.id, + transitionId, start: appliedLength }, bestProbFromSet: bestProbFromSet, @@ -611,7 +644,7 @@ export class ContextTokenization { return new ContextTokenization( this.tokens.slice(0, sliceIndex).concat(tokenization), - null /* tokenMapping */, + transitionEdge, determineTaillessTrueKeystroke(transitionEdge) ); } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts index 2857c77fb76..f17e0a011f0 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts @@ -16,17 +16,6 @@ import Reversion = LexicalModelTypes.Reversion; import Suggestion = LexicalModelTypes.Suggestion; import Transform = LexicalModelTypes.Transform; -// Mark affected tokens with the applied-suggestion transition ID -// for easy future reference. 
-const tagTokens = (state: ContextState, suggestion: Suggestion) => { - const inputs = state.tokenization.transitionEdits.inputs; - const appliedTokenCount = inputs[0].sample.size; - const tokens = state.tokenization.tokens; - for(let i = tokens.length - appliedTokenCount; i < tokens.length; i++) { - tokens[i].appliedTransitionId = suggestion.transformId; - } -} - /** * Represents the transition between two context states as triggered * by input keystrokes or applied suggestions. @@ -145,15 +134,15 @@ export class ContextTransition { const buildAppliedTransition = ( transition: ContextTransition, baseState: ContextState, - transform: Transform + transform: Transform, + appliedTransitionId: number ) => { const state = baseState.analyzeTransition( baseState.context, [{sample: transform, p: 1}], - true + appliedTransitionId ).final; - tagTokens(state, suggestion); transition._final = state; // Applying a suggestion should not forget the original suggestion set. @@ -166,7 +155,7 @@ export class ContextTransition { // keystroke data. const resultTransition = new ContextTransition(this); - buildAppliedTransition(resultTransition, this.base, suggestion.transform); + buildAppliedTransition(resultTransition, this.base, suggestion.transform, suggestion.transformId); // An applied suggestion should replace the original Transition's effects, though keeping // the original input around. @@ -178,7 +167,7 @@ export class ContextTransition { } const finalTransition = new ContextTransition(resultTransition.final, suggestion.appendedTransform.id); - buildAppliedTransition(finalTransition, resultTransition.final, suggestion.appendedTransform); + buildAppliedTransition(finalTransition, resultTransition.final, suggestion.appendedTransform, suggestion.transformId); // The appended transform is applied with no intermediate input. 
finalTransition.final.appliedInput = { insert: '', deleteLeft: 0 }; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 43af11c08ec..0a9df50685f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -346,14 +346,13 @@ export function determineSuggestionAlignment( const context = transition.base.context; const postContext = transition.final.context; const inputTransform = transition.inputDistribution[0].sample; - const inputTransformMap = transitionEdits?.inputs[0].sample; let deleteLeft: number; // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist. const wordbreak = determineModelWordbreaker(lexicalModel); // Is the token under construction newly-constructed / is there no pre-existing root? - if(tokenization.taillessTrueKeystroke && inputTransformMap?.has(1)) { + if(tokenization.taillessTrueKeystroke && transitionEdits?.addedNewTokens) { return { // If the new token is due to whitespace or due to a different input type // that would likely imply a tokenization boundary, infer 'new word' mode. @@ -366,7 +365,7 @@ export function determineSuggestionAlignment( deleteLeft: 0 }; // If the tokenized context length is shorter... sounds like a backspace (or similar). - } else if (transitionEdits?.alignment.removedTokenCount > 0) { + } else if (transitionEdits?.removedOldTokens) { /* Ooh, we've dropped context here. Almost certainly from a backspace or * similar effect. Even if we drop multiple tokens... well, we know exactly * how many chars were actually deleted - `inputTransform.deleteLeft`. 
Since diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index bcc1441fc0b..7340a257f8e 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -130,7 +130,10 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens); assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' ')); assert.isOk(tokenization.transitionEdits); - assert.deepEqual(tokenization.transitionEdits, transitionEdits); + assert.deepEqual(tokenization.transitionEdits, { + addedNewTokens: false, + removedOldTokens: false + }); assert.equal(tokenization.tail.exampleInput, 'day'); assert.isFalse(tokenization.tail.isWhitespace); }); @@ -191,10 +194,10 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', ''].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0, id: 11 }; const inputTransformMap: Map = new Map(); - inputTransformMap.set(1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set(2, { insert: '', deleteLeft: 0 }); + inputTransformMap.set(1, { insert: ' ', deleteLeft: 0, id: 11 }); + inputTransformMap.set(2, { insert: '', deleteLeft: 0, id: 11 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -212,8 +215,7 @@ 
describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -266,8 +268,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -283,9 +284,9 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: 'y', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: 'y', deleteLeft: 0, deleteRight: 0, id: 13 }; const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: 'y', deleteLeft: 0 }); + inputTransformMap.set(0, { insert: 'y', deleteLeft: 0, id: 13 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -304,8 +305,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -330,9 +330,9 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'week'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: 'week', deleteLeft: 3, deleteRight: 0 }; + const inputTransform = { insert: 'week', deleteLeft: 3, deleteRight: 0, id: 12 }; const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: 'week', deleteLeft: 3 }); + inputTransformMap.set(0, { insert: 'week', deleteLeft: 3, id: 12 }); const edgeWindow = 
buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -351,8 +351,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -376,7 +375,7 @@ describe('ContextTokenization', function() { (tokenization.tail.searchModule as SearchQuotientSpur).lastInput, // As we fully deleted the old token, the new one "starts" after the deleteLeft. // The deleteLeft component should not be included here. - [{sample: { insert: 'week', deleteLeft: 0 /* NOT 3 */ }, p: 1}] + [{sample: { insert: 'week', deleteLeft: 0 /* NOT 3 */, id: inputTransform.id }, p: 1}] ); }); @@ -408,8 +407,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -461,8 +459,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: subsetId }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -514,10 +511,10 @@ describe('ContextTokenization', function() { const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', ''].map((t) => ( {text: t, isWhitespace: t != '' && t.trim() == ''} )); - const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0, id: 42 }; const inputTransformMap: Map = new Map(); - inputTransformMap.set(-1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set( 0, { insert: '', deleteLeft: 0 }); + inputTransformMap.set(-1, { insert: ' ', deleteLeft: 0, id: 42 }); + inputTransformMap.set( 0, { insert: '', deleteLeft: 0, id: 42 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = 
baseTokenization.evaluateTransition({ @@ -535,8 +532,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - { insert: ' ', deleteLeft: 0 }, + inputTransform.id, 1 ); @@ -566,7 +562,7 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'t'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0, id: 42 }; const inputTransformMap: Map = new Map(); inputTransformMap.set(0, inputTransform); @@ -599,8 +595,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - { insert: 't', deleteLeft: 0 }, + inputTransform.id, 1 ); @@ -630,7 +625,7 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'', '.'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: '.', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: '.', deleteLeft: 0, deleteRight: 0, id: 101 }; const inputTransformMap: Map = new Map(); // Lands after the split-off '\''. 
inputTransformMap.set(1, { insert: '.', deleteLeft: 0 }); @@ -666,8 +661,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts index 269f7d4a95c..844c5a24744 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts @@ -137,7 +137,7 @@ describe('matchBaseContextState', () => { left: 'ot of test here might cause the sliding context window to shift ', startOfBuffer: false, // We're sliding now. endOfBuffer: true - }, [{sample: { insert: 'dramatically', deleteLeft: 0 }, p: 1}], true); + }, [{sample: { insert: 'dramatically', deleteLeft: 0 }, p: 1}], Math.random()); contextTracker.latest = transition; const warningSpy = sinon.spy(console, 'warn'); @@ -168,7 +168,7 @@ it('handles backward-sliding context after big delete', () => { left: 'ere might cause the sliding context window to shift dramatically', startOfBuffer: false, // We're sliding now. 
endOfBuffer: true - }, [{sample: { insert: '', deleteLeft: 'dramatically'.length }, p: 1}], true); + }, [{sample: { insert: '', deleteLeft: 'dramatically'.length }, p: 1}], Math.random()); contextTracker.latest = transition; const warningSpy = sinon.spy(console, 'warn'); From 4f257f522b0a88f8b90e328255556a1476542c33 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Thu, 20 Nov 2025 08:53:07 -0600 Subject: [PATCH 2/2] feat(web): adds .realign unit tests, restores split/merge ContextTokenization unit tests --- .../src/main/correction/context-state.ts | 3 +- .../main/correction/context-tokenization.ts | 22 ++- .../context/context-tokenization.tests.ts | 159 ++++++++++++++++-- 3 files changed, 160 insertions(+), 24 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts index ac990defcb8..c6b7cfea246 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts @@ -250,7 +250,8 @@ export class ContextState { // into subsets. const bestProb = transformDistribution.reduce((best, curr) => Math.max(best, curr.p), 0); // Should gain one per subsetBuilder.subsets entry. 
- const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb, appliedSuggestionId); + const realignedTokenization = baseTokenization.realign(tokenizationAnalysis.alignment); + const resultTokenization = realignedTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb, appliedSuggestionId); // ------------ diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index b97d7328a4f..f30cf39b190 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -548,7 +548,7 @@ export class ContextTokenization { tokenization.push(token); } - return new ContextTokenization(tokenization, null, this.taillessTrueKeystroke); + return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tokenization), null, this.taillessTrueKeystroke); } /** @@ -579,32 +579,32 @@ export class ContextTokenization { let affectedToken: ContextToken; - const realignedTokenization = this.realign(alignment); - const tokenization = realignedTokenization.tokens; + const tailTokenization = this.tokens.slice(sliceIndex); // Assumption: inputs.length > 0. (There is at least one input transform.) 
const inputTransformKeys = [...inputs[0].sample.keys()]; + const baseTailIndex = (tailTokenization.length - 1); let removedTokenCount = alignment.removedTokenCount; while(removedTokenCount-- > 0) { inputTransformKeys.pop(); - tokenization.pop(); + tailTokenization.pop(); } let appliedLength = 0; for(let i = 0; i < inputTransformKeys.length; i++) { const tailRelativeIndex = inputTransformKeys[i]; let distribution = inputs.map((i) => ({sample: i.sample.get(tailRelativeIndex), p: i.p})); - const tokenIndex = (tokenization.length - 1) + tailRelativeIndex; + const tokenIndex = baseTailIndex + tailRelativeIndex; - affectedToken = tokenization[tokenIndex]; + affectedToken = tailTokenization[tokenIndex]; if(!affectedToken) { affectedToken = new ContextToken(lexicalModel); - tokenization.push(affectedToken); + tailTokenization.push(affectedToken); } else if(KMWString.length(affectedToken.exampleInput) == distribution[0].sample.deleteLeft) { // If the entire token will be replaced, throw out the old one and start anew. affectedToken = new ContextToken(lexicalModel); // Replace the token at the affected index with a brand-new token. - tokenization.splice(tokenIndex, 1, affectedToken); + tailTokenization.splice(tokenIndex, 1, affectedToken); } affectedToken.isPartial = true; @@ -634,16 +634,20 @@ export class ContextTokenization { inputSource.segment.end = appliedLength; } + affectedToken = new ContextToken(affectedToken); affectedToken.addInput(inputSource, distribution); const tokenize = determineModelTokenizer(lexicalModel); affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? 
false; + // Do not re-use the previous token; the mutation may have unexpected + // results (say, in unit-testing) + tailTokenization[tokenIndex] = affectedToken; affectedToken = null; } return new ContextTokenization( - this.tokens.slice(0, sliceIndex).concat(tokenization), + this.tokens.slice(0, sliceIndex).concat(tailTokenization), transitionEdge, determineTaillessTrueKeystroke(transitionEdge) ); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index 7340a257f8e..e16c8090ea9 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -188,6 +188,123 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.exampleInput, rawTextTokens); }); + describe('realign', () => { + it('performs queued merge operations', () => { + const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; + const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); + + const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\''].map((t) => ({text: t, isWhitespace: t == ' '})); + const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0, id: 42 }; + + const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); + const tokenization = baseTokenization.realign({ + merges: [{ + inputs: [{ + text: 'can', + index: 8 - edgeWindow.sliceIndex + }, { + text: '\'', + index: 9 - edgeWindow.sliceIndex + } + ], + match: { + text: 'can\'', + index: 8 - edgeWindow.sliceIndex + } + }], + splits: [], + unmappedEdits: [], + edgeWindow: { + ...edgeWindow, + // The range within the window constructed by the prior call for its parameterization. 
+ retokenization: [...targetTokens.slice(edgeWindow.sliceIndex, -1).map(t => t.text), 'can\''] + }, + removedTokenCount: 0 + }); + + assert.isOk(tokenization); + assert.equal(tokenization.tokens.length, targetTokens.length); + + assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), + targetTokens + ); + + const basePreTail = baseTokenization.tokens[baseTokenization.tokens.length - 2]; + const baseTail = baseTokenization.tail; + assert.equal( + tokenization.tail.searchModule.inputCount, + basePreTail.searchModule.inputCount + baseTail.searchModule.inputCount + ); + assert.equal(tokenization.tail.exampleInput, 'can\''); + assert.deepEqual(tokenization.tail.searchModule.bestExample, { + text: basePreTail.searchModule.bestExample.text + baseTail.searchModule.bestExample.text, + p: basePreTail.searchModule.bestExample.p * baseTail.searchModule.bestExample.p + }); + }); + + it('performs queued split operations', () => { + const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; + const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); + + const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\''].map((t) => ({text: t, isWhitespace: t == ' '})); + const inputTransform = { insert: '.', deleteLeft: 0, deleteRight: 0, id: 101 }; + const inputTransformMap: Map = new Map(); + // Lands after the split-off '\''. 
+ inputTransformMap.set(1, { insert: '.', deleteLeft: 0 }); + + const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); + const tokenization = baseTokenization.realign({ + merges: [], + splits: [{ + matches: [{ + text: 'can', + index: 8 - edgeWindow.sliceIndex, + textOffset: 0 + }, { + text: '\'', + index: 9 - edgeWindow.sliceIndex, + textOffset: 3 + } + ], + input: { + text: 'can\'', + index: 8 - edgeWindow.sliceIndex + } + }], + unmappedEdits: [], + edgeWindow: { + ...edgeWindow, + // The range within the window constructed by the prior call for its parameterization. + retokenization: [...targetTokens.slice(edgeWindow.sliceIndex, -1).map(t => t.text)] + }, + removedTokenCount: 0 + }); + + assert.isOk(tokenization); + assert.equal(tokenization.tokens.length, targetTokens.length); + + assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), + targetTokens + ); + + const preTail = tokenization.tokens[tokenization.tokens.length - 2]; + const tail = tokenization.tail; + assert.equal( + baseTokenization.tail.searchModule.inputCount, + preTail.searchModule.inputCount + tail.searchModule.inputCount + ); + assert.equal(tail.searchModule.inputCount, 1); + // base tokenization did not include the '.' component. 
+ assert.deepEqual((tail.searchModule as SearchQuotientSpur).lastInput, (baseTokenization.tail.searchModule as SearchQuotientSpur).lastInput); + assert.equal(preTail.exampleInput, 'can'); + assert.equal(tail.exampleInput, '\''); + assert.deepEqual({ + text: preTail.searchModule.bestExample.text + tail.searchModule.bestExample.text, + p: preTail.searchModule.bestExample.p * tail.searchModule.bestExample.p + }, baseTokenization.tail.searchModule.bestExample); + }); + }); + describe('evaluateTransition', () => { it('handles simple case - new whitespace + new empty token', () => { const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day']; @@ -557,8 +674,10 @@ describe('ContextTokenization', function() { } }); - it.skip('handles case that triggers a token merge: can+\'+t', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; + it('handles case that triggers a token merge: can+\'+t', () => { + // Matches results from a pre-run .realign call; + // 'can' and '\'' would have been separate before it. 
+ const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'t'].map((t) => ({text: t, isWhitespace: t == ' '})); @@ -568,6 +687,7 @@ describe('ContextTokenization', function() { const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ + // matches the 'alignment' seen in realign "queued merge" test alignment: { merges: [{ inputs: [{ @@ -606,22 +726,23 @@ describe('ContextTokenization', function() { targetTokens ); - const basePreTail = baseTokenization.tokens[baseTokenization.tokens.length - 2]; const baseTail = baseTokenization.tail; assert.equal( tokenization.tail.searchModule.inputCount, - basePreTail.searchModule.inputCount + baseTail.searchModule.inputCount + 1 /* +1 - incoming transform */ + baseTail.searchModule.inputCount + 1 /* +1 - incoming transform */ ); assert.deepEqual((tokenization.tail.searchModule as SearchQuotientSpur).lastInput, [{ sample: inputTransform, p: 1 }]); assert.equal(tokenization.tail.exampleInput, 'can\'t'); assert.deepEqual(tokenization.tail.searchModule.bestExample, { - text: basePreTail.searchModule.bestExample.text + baseTail.searchModule.bestExample.text + inputTransform.insert, - p: basePreTail.searchModule.bestExample.p * baseTail.searchModule.bestExample.p * 1 /* prob of input transform */ + text: baseTail.searchModule.bestExample.text + inputTransform.insert, + p: baseTail.searchModule.bestExample.p * 1 /* prob of input transform */ }); }); - it.skip('handles case that triggers a token split: can\' +. => can, \', .', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; + it('handles case that triggers a token split: can\' +. 
=> can, \', .', () => { + // Matches results from a pre-run .realign call; + // 'can' and '\'' would have been merged before it. + const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'', '.'].map((t) => ({text: t, isWhitespace: t == ' '})); @@ -632,6 +753,7 @@ describe('ContextTokenization', function() { const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ + // matches the 'alignment' seen in realign "queued split" test alignment: { merges: [], splits: [{ @@ -676,9 +798,22 @@ describe('ContextTokenization', function() { const preTail = tokenization.tokens[tokenization.tokens.length - 2]; const tail = tokenization.tail; assert.equal( - baseTokenization.tail.searchModule.inputCount, - prepreTail.searchModule.inputCount + preTail.searchModule.inputCount + prepreTail.searchModule.inputCount, + baseTokenization.tokens[baseTokenization.tokens.length - 2].searchModule.inputCount ); + assert.deepEqual( + prepreTail.searchModule.bestExample, + baseTokenization.tokens[baseTokenization.tokens.length - 2].searchModule.bestExample + ); + assert.equal( + preTail.searchModule.inputCount, + baseTokenization.tail.searchModule.inputCount + ); + assert.deepEqual( + preTail.searchModule.bestExample, + baseTokenization.tail.searchModule.bestExample + ); + assert.equal(tail.searchModule.inputCount, 1); // base tokenization did not include the '.' component. 
assert.deepEqual((preTail.searchModule as SearchQuotientSpur).lastInput, (baseTokenization.tail.searchModule as SearchQuotientSpur).lastInput); @@ -686,10 +821,6 @@ describe('ContextTokenization', function() { assert.equal(prepreTail.exampleInput, 'can'); assert.equal(preTail.exampleInput, '\''); assert.equal(tail.exampleInput, '.'); - assert.deepEqual({ - text: prepreTail.searchModule.bestExample.text + preTail.searchModule.bestExample.text, - p: prepreTail.searchModule.bestExample.p * preTail.searchModule.bestExample.p - }, baseTokenization.tail.searchModule.bestExample); }); });