diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts index e119ad63c80..c6b7cfea246 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts @@ -187,14 +187,15 @@ export class ContextState { * * May also contain a single entry for applying Suggestions or when correction behavior * is disabled. - * @param isApplyingSuggestion When true, alters behavior to better model application of suggestions. + * @param appliedSuggestionId When defined, notes the original transition ID corresponding to + * the applied suggestion. * @returns */ analyzeTransition( context: Context, transformDistribution: Distribution<Transform>, // overrides checks for token substitution that can fail for large applied suggestions. - isApplyingSuggestion?: boolean + appliedSuggestionId?: number ): ContextTransition { const lexicalModel = this.model; @@ -249,7 +250,8 @@ export class ContextState { // into subsets. const bestProb = transformDistribution.reduce((best, curr) => Math.max(best, curr.p), 0); // Should gain one per subsetBuilder.subsets entry. - const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, lexicalModel, trueInput, bestProb); + const realignedTokenization = baseTokenization.realign(tokenizationAnalysis.alignment); + const resultTokenization = realignedTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb, appliedSuggestionId); // ------------ diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index 0fbc3494a6a..f30cf39b190 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -105,11 +105,15 @@ export class ContextTokenization { * The sequence of tokens in the context represented by this instance. */ readonly tokens: ContextToken[]; + /** - * The tokenization-transition metadata relating this instance to the most likely - * tokenization from a prior state. + * Denotes whether or not the transition to this tokenization added or deleted + * any tokens. */ - readonly transitionEdits?: TransitionEdge; + readonly transitionEdits?: { + addedNewTokens: boolean, + removedOldTokens: boolean + }; /** * The portion of edits from the true input keystroke that are not part of the @@ -129,13 +133,18 @@ export class ContextTokenization { constructor(tokens: ContextToken[], alignment: TransitionEdge, taillessTrueKeystroke: Transform); constructor( param1: ContextToken[] | ContextTokenization, - alignment?: TransitionEdge, + tokenizationPath?: TransitionEdge, taillessTrueKeystroke?: Transform ) { if(!(param1 instanceof ContextTokenization)) { const tokens = param1; this.tokens = [].concat(tokens); - this.transitionEdits = alignment; + if(tokenizationPath) { + this.transitionEdits = { + addedNewTokens: tokenizationPath?.inputs[0].sample.has(1) ?? false, + removedOldTokens: (tokenizationPath?.alignment.removedTokenCount ?? 
0) > 0 } } this.taillessTrueKeystroke = taillessTrueKeystroke; } else { const priorToClone = param1; @@ -489,30 +498,16 @@ export class ContextTokenization { /** * Given results from `precomputeTokenizationAfterInput`, this method will - * evaluate the pending transition in tokenization for all associated inputs + * realign this tokenization's range to match the incoming keystroke's context window * while reusing as many correction-search intermediate results as possible. - * @param transitionEdge Batched results from one or more + * @param alignment The shared alignment metadata from one or more * `precomputeTokenizationAfterInput` calls on this instance, all with the * same alignment values. - * @param lexicalModel The active lexical model - * @param sourceInput The Transform associated with the keystroke triggering - * the transition. - * @param bestProbFromSet The probability of the single most likely input - * transform in the overall transformDistribution associated with the - * keystroke triggering the transition. It need not be represented by the - * TransitionEdge to be built. * @returns */ - evaluateTransition( - transitionEdge: TransitionEdge, - lexicalModel: LexicalModel, - sourceInput: Transform, - bestProbFromSet: number - ): ContextTokenization { - const { alignment: alignment, inputs } = transitionEdge; + realign(alignment: TransitionEdgeAlignment): ContextTokenization { const sliceIndex = alignment.edgeWindow.sliceIndex; const baseTokenization = this.tokens.slice(sliceIndex); - let affectedToken: ContextToken; const tokenization: ContextToken[] = []; @@ -553,33 +548,71 @@ export class ContextTokenization { tokenization.push(token); } + return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tokenization), null, this.taillessTrueKeystroke); + } + + /** + * Given results from `precomputeTokenizationAfterInput`, this method will + * evaluate the pending transition in tokenization for all associated inputs + * while reusing as many correction-search intermediate results as possible. + * @param transitionEdge Batched results from one or more + * `precomputeTokenizationAfterInput` calls on this instance, all with the + * same alignment values. + * @param transitionId The id of the Transform associated with the keystroke + * triggering the transition. + * @param bestProbFromSet The probability of the single most likely input + * transform in the overall transformDistribution associated with the + * keystroke triggering the transition. It need not be represented by the + * transitionEdge to be built. + * @param appliedSuggestionId When defined, notes the original transition ID corresponding to + * the applied suggestion; affected tokens will be tagged with it. + * @returns */ + evaluateTransition( + transitionEdge: TransitionEdge, + transitionId: number, + bestProbFromSet: number, + appliedSuggestionId?: number + ): ContextTokenization { + const { alignment, inputs } = transitionEdge; + const sliceIndex = alignment.edgeWindow.sliceIndex; + const lexicalModel = this.tail.searchModule.model; + + let affectedToken: ContextToken; + + const tailTokenization = this.tokens.slice(sliceIndex); + // Assumption: inputs.length > 0. (There is at least one input transform.) 
const inputTransformKeys = [...inputs[0].sample.keys()]; + const baseTailIndex = (tailTokenization.length - 1); let removedTokenCount = alignment.removedTokenCount; while(removedTokenCount-- > 0) { inputTransformKeys.pop(); - tokenization.pop(); + tailTokenization.pop(); } let appliedLength = 0; for(let i = 0; i < inputTransformKeys.length; i++) { const tailRelativeIndex = inputTransformKeys[i]; let distribution = inputs.map((i) => ({sample: i.sample.get(tailRelativeIndex), p: i.p})); - const tokenIndex = (tokenization.length - 1) + tailRelativeIndex; + const tokenIndex = baseTailIndex + tailRelativeIndex; - affectedToken = tokenization[tokenIndex]; + affectedToken = tailTokenization[tokenIndex]; if(!affectedToken) { affectedToken = new ContextToken(lexicalModel); - tokenization.push(affectedToken); + tailTokenization.push(affectedToken); } else if(KMWString.length(affectedToken.exampleInput) == distribution[0].sample.deleteLeft) { // If the entire token will be replaced, throw out the old one and start anew. affectedToken = new ContextToken(lexicalModel); // Replace the token at the affected index with a brand-new token. - tokenization.splice(tokenIndex, 1, affectedToken); + tailTokenization.splice(tokenIndex, 1, affectedToken); } affectedToken.isPartial = true; - delete affectedToken.appliedTransitionId; + if(appliedSuggestionId !== undefined) { + affectedToken.appliedTransitionId = appliedSuggestionId; + } else { + delete affectedToken.appliedTransitionId; + } // If we are completely replacing a token via delete left, erase the deleteLeft; // that part applied to a _previous_ token that no longer exists. @@ -590,7 +623,7 @@ export class ContextTokenization { const inputSource: PathInputProperties = { segment: { - transitionId: sourceInput.id, + transitionId, start: appliedLength }, bestProbFromSet: bestProbFromSet, @@ -601,17 +634,21 @@ export class ContextTokenization { inputSource.segment.end = appliedLength; } + affectedToken = new ContextToken(affectedToken); affectedToken.addInput(inputSource, distribution); const tokenize = determineModelTokenizer(lexicalModel); affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false; + // Do not re-use the previous token; the mutation may have unexpected + // results (say, in unit-testing) + tailTokenization[tokenIndex] = affectedToken; affectedToken = null; } return new ContextTokenization( - this.tokens.slice(0, sliceIndex).concat(tokenization), - null /* tokenMapping */, + this.tokens.slice(0, sliceIndex).concat(tailTokenization), + transitionEdge, determineTaillessTrueKeystroke(transitionEdge) ); } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts index 2857c77fb76..f17e0a011f0 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts @@ -16,17 +16,6 @@ import Reversion = LexicalModelTypes.Reversion; import Suggestion = LexicalModelTypes.Suggestion; import Transform = LexicalModelTypes.Transform; -// Mark affected tokens with the applied-suggestion transition ID -// for easy future reference. 
-const tagTokens = (state: ContextState, suggestion: Suggestion) => { - const inputs = state.tokenization.transitionEdits.inputs; - const appliedTokenCount = inputs[0].sample.size; - const tokens = state.tokenization.tokens; - for(let i = tokens.length - appliedTokenCount; i < tokens.length; i++) { - tokens[i].appliedTransitionId = suggestion.transformId; - } -} - /** * Represents the transition between two context states as triggered * by input keystrokes or applied suggestions. @@ -145,15 +134,15 @@ export class ContextTransition { const buildAppliedTransition = ( transition: ContextTransition, baseState: ContextState, - transform: Transform + transform: Transform, + appliedTransitionId: number ) => { const state = baseState.analyzeTransition( baseState.context, [{sample: transform, p: 1}], - true + appliedTransitionId ).final; - tagTokens(state, suggestion); transition._final = state; // Applying a suggestion should not forget the original suggestion set. @@ -166,7 +155,7 @@ export class ContextTransition { // keystroke data. const resultTransition = new ContextTransition(this); - buildAppliedTransition(resultTransition, this.base, suggestion.transform); + buildAppliedTransition(resultTransition, this.base, suggestion.transform, suggestion.transformId); // An applied suggestion should replace the original Transition's effects, though keeping // the original input around. @@ -178,7 +167,7 @@ export class ContextTransition { } const finalTransition = new ContextTransition(resultTransition.final, suggestion.appendedTransform.id); - buildAppliedTransition(finalTransition, resultTransition.final, suggestion.appendedTransform); + buildAppliedTransition(finalTransition, resultTransition.final, suggestion.appendedTransform, suggestion.transformId); // The appended transform is applied with no intermediate input. finalTransition.final.appliedInput = { insert: '', deleteLeft: 0 }; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 43af11c08ec..0a9df50685f 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -346,14 +346,13 @@ export function determineSuggestionAlignment( const context = transition.base.context; const postContext = transition.final.context; const inputTransform = transition.inputDistribution[0].sample; - const inputTransformMap = transitionEdits?.inputs[0].sample; let deleteLeft: number; // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist. const wordbreak = determineModelWordbreaker(lexicalModel); // Is the token under construction newly-constructed / is there no pre-existing root? - if(tokenization.taillessTrueKeystroke && inputTransformMap?.has(1)) { + if(tokenization.taillessTrueKeystroke && transitionEdits?.addedNewTokens) { return { // If the new token is due to whitespace or due to a different input type // that would likely imply a tokenization boundary, infer 'new word' mode. @@ -366,7 +365,7 @@ export function determineSuggestionAlignment( deleteLeft: 0 }; // If the tokenized context length is shorter... sounds like a backspace (or similar). - } else if (transitionEdits?.alignment.removedTokenCount > 0) { + } else if (transitionEdits?.removedOldTokens) { /* Ooh, we've dropped context here. Almost certainly from a backspace or * similar effect. Even if we drop multiple tokens... 
well, we know exactly * how many chars were actually deleted - `inputTransform.deleteLeft`. Since diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index bcc1441fc0b..e16c8090ea9 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -130,7 +130,10 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens); assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' ')); assert.isOk(tokenization.transitionEdits); - assert.deepEqual(tokenization.transitionEdits, transitionEdits); + assert.deepEqual(tokenization.transitionEdits, { + addedNewTokens: false, + removedOldTokens: false + }); assert.equal(tokenization.tail.exampleInput, 'day'); assert.isFalse(tokenization.tail.isWhitespace); }); @@ -185,16 +188,133 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.exampleInput, rawTextTokens); }); + describe('realign', () => { + it('performs queued merge operations', () => { + const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; + const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); + + const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\''].map((t) => ({text: t, isWhitespace: t == ' '})); + const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0, id: 42 }; + + const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); + const tokenization = baseTokenization.realign({ + merges: [{ + inputs: [{ + text: 'can', + index: 8 - edgeWindow.sliceIndex + }, { + text: '\'', + index: 9 - edgeWindow.sliceIndex + } + ], + match: { + text: 'can\'', + index: 8 - edgeWindow.sliceIndex + } + }], + splits: [], + unmappedEdits: [], + edgeWindow: { + ...edgeWindow, + // The retokenized text for the range within the window constructed by the prior buildEdgeWindow call. 
+ retokenization: [...targetTokens.slice(edgeWindow.sliceIndex, -1).map(t => t.text), 'can\''] + }, + removedTokenCount: 0 + }); + + assert.isOk(tokenization); + assert.equal(tokenization.tokens.length, targetTokens.length); + + assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), + targetTokens + ); + + const basePreTail = baseTokenization.tokens[baseTokenization.tokens.length - 2]; + const baseTail = baseTokenization.tail; + assert.equal( + tokenization.tail.searchModule.inputCount, + basePreTail.searchModule.inputCount + baseTail.searchModule.inputCount + ); + assert.equal(tokenization.tail.exampleInput, 'can\''); + assert.deepEqual(tokenization.tail.searchModule.bestExample, { + text: basePreTail.searchModule.bestExample.text + baseTail.searchModule.bestExample.text, + p: basePreTail.searchModule.bestExample.p * baseTail.searchModule.bestExample.p + }); + }); + + it('performs queued split operations', () => { + const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; + const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); + + const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\''].map((t) => ({text: t, isWhitespace: t == ' '})); + const inputTransform = { insert: '.', deleteLeft: 0, deleteRight: 0, id: 101 }; + + const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); + const tokenization = baseTokenization.realign({ + merges: [], + splits: [{ + matches: [{ + text: 'can', + index: 8 - edgeWindow.sliceIndex, + textOffset: 0 + }, { + text: '\'', + index: 9 - edgeWindow.sliceIndex, + textOffset: 3 + } + ], + input: { + text: 'can\'', + index: 8 - edgeWindow.sliceIndex + } + }], + unmappedEdits: [], + edgeWindow: { + ...edgeWindow, + // The retokenized text for the range within the window constructed by the prior buildEdgeWindow call. + retokenization: [...targetTokens.slice(edgeWindow.sliceIndex, -1).map(t => t.text)] + }, + removedTokenCount: 0 + }); + + assert.isOk(tokenization); + assert.equal(tokenization.tokens.length, targetTokens.length); + + assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), + targetTokens + ); + + const preTail = tokenization.tokens[tokenization.tokens.length - 2]; + const tail = tokenization.tail; + assert.equal( + baseTokenization.tail.searchModule.inputCount, + preTail.searchModule.inputCount + tail.searchModule.inputCount + ); + assert.equal(tail.searchModule.inputCount, 1); + // realign does not apply the pending '.' input; the split-off tail keeps its original last input. 
+ assert.deepEqual((tail.searchModule as SearchQuotientSpur).lastInput, (baseTokenization.tail.searchModule as SearchQuotientSpur).lastInput); + assert.equal(preTail.exampleInput, 'can'); + assert.equal(tail.exampleInput, '\''); + assert.deepEqual({ + text: preTail.searchModule.bestExample.text + tail.searchModule.bestExample.text, + p: preTail.searchModule.bestExample.p * tail.searchModule.bestExample.p + }, baseTokenization.tail.searchModule.bestExample); + }); + }); + describe('evaluateTransition', () => { it('handles simple case - new whitespace + new empty token', () => { const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', ''].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0, id: 11 }; const inputTransformMap: Map<number, Transform> = new Map(); - inputTransformMap.set(1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set(2, { insert: '', deleteLeft: 0 }); + inputTransformMap.set(1, { insert: ' ', deleteLeft: 0, id: 11 }); + inputTransformMap.set(2, { insert: '', deleteLeft: 0, id: 11 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -212,8 +332,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -266,8 +385,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -283,9 +401,9 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: 'y', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: 'y', deleteLeft: 0, deleteRight: 0, id: 13 }; const inputTransformMap: Map<number, Transform> = new Map(); - inputTransformMap.set(0, { insert: 'y', deleteLeft: 0 }); + inputTransformMap.set(0, { insert: 'y', deleteLeft: 0, id: 13 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -304,8 +422,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -330,9 +447,9 @@ describe('ContextTokenization', function() { const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'week'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: 'week', deleteLeft: 3, deleteRight: 0 }; + const inputTransform = { insert: 'week', deleteLeft: 3, deleteRight: 0, id: 12 }; const inputTransformMap: Map<number, Transform> = new Map(); - inputTransformMap.set(0, { insert: 'week', deleteLeft: 3 }); + inputTransformMap.set(0, { insert: 'week', deleteLeft: 3, id: 12 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization 
= baseTokenization.evaluateTransition({ @@ -351,8 +468,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -376,7 +492,7 @@ describe('ContextTokenization', function() { (tokenization.tail.searchModule as SearchQuotientSpur).lastInput, // As we fully deleted the old token, the new one "starts" after the deleteLeft. // The deleteLeft component should not be included here. - [{sample: { insert: 'week', deleteLeft: 0 /* NOT 3 */ }, p: 1}] + [{sample: { insert: 'week', deleteLeft: 0 /* NOT 3 */, id: inputTransform.id }, p: 1}] ); }); @@ -408,8 +524,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -461,8 +576,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: subsetId }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -514,10 +628,10 @@ describe('ContextTokenization', function() { const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', ''].map((t) => ( {text: t, isWhitespace: t != '' && t.trim() == ''} )); - const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: ' ', deleteLeft: 0, deleteRight: 0, id: 42 }; const inputTransformMap: Map<number, Transform> = new Map(); - inputTransformMap.set(-1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set( 0, { insert: '', deleteLeft: 0 }); + inputTransformMap.set(-1, { insert: ' ', deleteLeft: 0, id: 42 }); + inputTransformMap.set( 0, { insert: '', deleteLeft: 0, id: 42 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -535,8 +649,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - { insert: ' ', deleteLeft: 0 }, + inputTransform.id, 1 ); @@ -561,17 +674,20 @@ describe('ContextTokenization', function() { } }); - it.skip('handles case that triggers a token merge: can+\'+t', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; + it('handles case that triggers a token merge: can+\'+t', () => { + // Matches results from a pre-run .realign call; + // 'can' and '\'' would have been separate before it. 
+ const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'t'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0, id: 42 }; const inputTransformMap: Map<number, Transform> = new Map(); inputTransformMap.set(0, inputTransform); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ + // matches the 'alignment' seen in realign "queued merge" test alignment: { merges: [{ inputs: [{ @@ -599,8 +715,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - { insert: 't', deleteLeft: 0 }, + inputTransform.id, 1 ); @@ -611,32 +726,34 @@ describe('ContextTokenization', function() { targetTokens ); - const basePreTail = baseTokenization.tokens[baseTokenization.tokens.length - 2]; const baseTail = baseTokenization.tail; assert.equal( tokenization.tail.searchModule.inputCount, - basePreTail.searchModule.inputCount + baseTail.searchModule.inputCount + 1 /* +1 - incoming transform */ + baseTail.searchModule.inputCount + 1 /* +1 - incoming transform */ ); assert.deepEqual((tokenization.tail.searchModule as SearchQuotientSpur).lastInput, [{ sample: inputTransform, p: 1 }]); assert.equal(tokenization.tail.exampleInput, 'can\'t'); assert.deepEqual(tokenization.tail.searchModule.bestExample, { - text: basePreTail.searchModule.bestExample.text + baseTail.searchModule.bestExample.text + inputTransform.insert, - p: basePreTail.searchModule.bestExample.p * baseTail.searchModule.bestExample.p * 1 /* prob of input transform */ + text: baseTail.searchModule.bestExample.text + inputTransform.insert, + p: baseTail.searchModule.bestExample.p * 1 /* prob of input transform */ }); }); - it.skip('handles case that triggers a token split: can\' +. => can, \', .', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; + it('handles case that triggers a token split: can\' +. => can, \', .', () => { + // Matches results from a pre-run .realign call; + // 'can' and '\'' would have been merged before it. + const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'', '.'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransform = { insert: '.', deleteLeft: 0, deleteRight: 0 }; + const inputTransform = { insert: '.', deleteLeft: 0, deleteRight: 0, id: 101 }; const inputTransformMap: Map<number, Transform> = new Map(); // Lands after the split-off '\''. 
inputTransformMap.set(1, { insert: '.', deleteLeft: 0 }); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ + // matches the 'alignment' seen in realign "queued split" test alignment: { merges: [], splits: [{ @@ -666,8 +783,7 @@ describe('ContextTokenization', function() { inputs: [{ sample: inputTransformMap, p: 1 }], inputSubsetId: generateSubsetId() }, - plainModel, - inputTransform, + inputTransform.id, 1 ); @@ -682,9 +798,22 @@ describe('ContextTokenization', function() { const preTail = tokenization.tokens[tokenization.tokens.length - 2]; const tail = tokenization.tail; assert.equal( - baseTokenization.tail.searchModule.inputCount, - prepreTail.searchModule.inputCount + preTail.searchModule.inputCount + prepreTail.searchModule.inputCount, + baseTokenization.tokens[baseTokenization.tokens.length - 2].searchModule.inputCount + ); + assert.deepEqual( + prepreTail.searchModule.bestExample, + baseTokenization.tokens[baseTokenization.tokens.length - 2].searchModule.bestExample + ); + assert.equal( + preTail.searchModule.inputCount, + baseTokenization.tail.searchModule.inputCount ); + assert.deepEqual( + preTail.searchModule.bestExample, + baseTokenization.tail.searchModule.bestExample + ); + assert.equal(tail.searchModule.inputCount, 1); // base tokenization did not include the '.' component. assert.deepEqual((preTail.searchModule as SearchQuotientSpur).lastInput, (baseTokenization.tail.searchModule as SearchQuotientSpur).lastInput); @@ -692,10 +821,6 @@ describe('ContextTokenization', function() { assert.equal(prepreTail.exampleInput, 'can'); assert.equal(preTail.exampleInput, '\''); assert.equal(tail.exampleInput, '.'); - assert.deepEqual({ - text: prepreTail.searchModule.bestExample.text + preTail.searchModule.bestExample.text, - p: prepreTail.searchModule.bestExample.p * preTail.searchModule.bestExample.p - }, baseTokenization.tail.searchModule.bestExample); }); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts index 269f7d4a95c..844c5a24744 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/base-context-state.tests.ts @@ -137,7 +137,7 @@ describe('matchBaseContextState', () => { left: 'ot of test here might cause the sliding context window to shift ', startOfBuffer: false, // We're sliding now. endOfBuffer: true - }, [{sample: { insert: 'dramatically', deleteLeft: 0 }, p: 1}], true); + }, [{sample: { insert: 'dramatically', deleteLeft: 0 }, p: 1}], Math.random()); contextTracker.latest = transition; const warningSpy = sinon.spy(console, 'warn'); @@ -168,7 +168,7 @@ it('handles backward-sliding context after big delete', () => { left: 'ere might cause the sliding context window to shift dramatically', startOfBuffer: false, // We're sliding now. endOfBuffer: true - }, [{sample: { insert: '', deleteLeft: 'dramatically'.length }, p: 1}], true); + }, [{sample: { insert: '', deleteLeft: 'dramatically'.length }, p: 1}], Math.random()); contextTracker.latest = transition; const warningSpy = sinon.spy(console, 'warn');
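Note on the reshaped transition flow: `ContextTokenization.evaluateTransition` no longer performs realignment itself. `ContextState.analyzeTransition` first obtains a realigned tokenization, then evaluates the keystroke against it, passing only the transform's ID rather than the full Transform and lexical model. A minimal sketch of the new call sequence, mirroring the locals in the context-state.ts hunk above (`baseTokenization`, `tokenizationAnalysis`, `trueInput`, `bestProb`, and `appliedSuggestionId` are that method's own variables):

    // Step 1: apply any merges/splits queued by the alignment analysis.
    // realign() only reshapes existing tokens; it does not consume the keystroke.
    const realignedTokenization = baseTokenization.realign(tokenizationAnalysis.alignment);

    // Step 2: evaluate the keystroke against the realigned tokens. The lexical
    // model is now read from the tail token's search module, so only the
    // transform's ID needs to travel in. When appliedSuggestionId is defined,
    // affected tokens are tagged with it.
    const resultTokenization = realignedTokenization.evaluateTransition(
      tokenizationAnalysis, // TransitionEdge; carries the same alignment used in step 1
      trueInput.id,
      bestProb,
      appliedSuggestionId
    );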
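The slimmed `transitionEdits` field now records only the two booleans that `determineSuggestionAlignment` consumes, rather than the full `TransitionEdge`. A sketch of the shape, assuming (as the constructor's checks imply) that `inputs[0].sample` is keyed by tail-relative token index, with key 1 marking a token past the old tail; `summarizeTransitionEdits` is a hypothetical helper restating the constructor logic:

    interface TransitionEditSummary {
      addedNewTokens: boolean;   // the keystroke spilled into token(s) past the old tail
      removedOldTokens: boolean; // whole token(s) were deleted by the keystroke
    }

    function summarizeTransitionEdits(edge: TransitionEdge): TransitionEditSummary {
      return {
        addedNewTokens: edge.inputs[0].sample.has(1),
        removedOldTokens: edge.alignment.removedTokenCount > 0
      };
    }

In predict-helpers.ts, `addedNewTokens` replaces the old `inputTransformMap?.has(1)` check ('new word' mode) and `removedOldTokens` replaces `transitionEdits?.alignment.removedTokenCount > 0` (backspace-style deletion).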
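On suggestion tagging: the removed `tagTokens` helper stamped `appliedTransitionId` onto affected tokens after the fact, for easy future reference; `evaluateTransition` now does this inline while rebuilding each affected token. A condensed sketch of the relevant branch, paraphrasing the diff (`affectedToken` is evaluateTransition's own local):

    if (appliedSuggestionId !== undefined) {
      // Applying a suggestion: tag the token with the originating transition ID.
      affectedToken.appliedTransitionId = appliedSuggestionId;
    } else {
      // Ordinary keystrokes clear any stale suggestion tag.
      delete affectedToken.appliedTransitionId;
    }

Since each affected token is rebuilt as a fresh `ContextToken` clone before `addInput` runs (`tailTokenization[tokenIndex] = affectedToken`), prior tokenizations held by earlier states are no longer mutated; this is the concern called out in the "Do not re-use the previous token" comment.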