diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts
index 380b5dbe404..093d4c7a6d5 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts
@@ -262,30 +262,8 @@ export class ContextState {
     const nonEmptyTail = !tokens[lastIndex].isEmptyToken ? tokens[lastIndex] : tokens[lastIndex - 1];
     const appliedSuggestionTransitionId = nonEmptyTail?.appliedTransitionId;
 
-    const postContext = transformDistribution?.[0] ? applyTransform(transformDistribution[0].sample, context) : context;
-
-    // Note for future: the next line's pattern asserts that there is only one true tokenization.
-    // We may eventually allow for multiple potential tokenizations (per epic-dict-breaker)
-    const tokenizedContext = determineModelTokenizer(lexicalModel)(postContext).left;
-    if(tokenizedContext.length == 0) {
-      tokenizedContext.push({text: ''});
-    }
-    // In which case we could try need to align for each of them, starting from the most likely.
-
-    // If we're not at the start of the buffer, we're probably a sliding context.
-    const isSliding = !this.context.startOfBuffer;
-
-    // It's possible the tokenization will remember more of the initial token than is
-    // actually present in the sliding context window, which imposes a need for a wide-band
-    // computeDistance 'radius' in the called function.
-    const alignmentResults = this.tokenization.computeAlignment(tokenizedContext.map((token) => token.text), isSliding, isApplyingSuggestion);
-
-    // Stopgap: add tokenized transformSequenceDistribution to the alignment data & use that
-    // where noted: tagTokens() in context-transition.ts, `determineSuggestionAlignment()`.
-
-    const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
-    state.tokenization = new ContextTokenization(resultTokenization.tokens, alignmentResults, resultTokenization.taillessTrueKeystroke);
+    state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke);
     state.appliedInput = transformDistribution?.[0].sample;
 
     transition.finalize(state, transformDistribution, resultTokenization.taillessTrueKeystroke);
     transition.revertableTransitionId = appliedSuggestionTransitionId;
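Note: the surviving `+` line references `state` and `tokenizationAnalysis`, neither of which is declared within this hunk; both are presumably established earlier in the method by changes outside the visible range. A minimal sketch of the assumed precondition (hypothetical; the true declarations are not shown in this diff):

```ts
// Hypothetical sketch - the actual declarations live above this hunk.
// `tokenizationAnalysis` is assumed to be the PendingTokenization produced
// while evaluating the context transition.
declare const tokenizationAnalysis: PendingTokenization;
const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
```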
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
index fa9971449b0..864164d3cbe 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
@@ -12,7 +12,6 @@
 import { KMWString } from '@keymanapp/web-utils';
 import { ContextToken } from './context-token.js';
 import TransformUtils from '../transformUtils.js';
-import { computeAlignment, ContextStateAlignment } from './alignment-helpers.js';
 import { computeDistance, EditOperation, EditTuple } from './classical-calculation.js';
 import { determineModelTokenizer } from '../model-helpers.js';
 import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js';
@@ -105,12 +104,11 @@ export class ContextTokenization {
    * The sequence of tokens in the context represented by this instance.
    */
   readonly tokens: ContextToken[];
-
   /**
    * The tokenization-transition metadata relating this instance to the most likely
    * tokenization from a prior state.
    */
-  readonly alignment?: ContextStateAlignment;
+  readonly transitionEdits?: PendingTokenization;
 
   /**
    * The portion of edits from the true input keystroke that are not part of the
@@ -125,21 +123,21 @@
 
   constructor(priorToClone: ContextTokenization);
   constructor(tokens: ContextToken[]);
-  constructor(tokens: ContextToken[], alignment: ContextStateAlignment, taillessTrueKeystroke: Transform);
+  constructor(tokens: ContextToken[], alignment: PendingTokenization, taillessTrueKeystroke: Transform);
   constructor(
     param1: ContextToken[] | ContextTokenization,
-    alignment?: ContextStateAlignment,
+    alignment?: PendingTokenization,
     taillessTrueKeystroke?: Transform
   ) {
     if(!(param1 instanceof ContextTokenization)) {
       const tokens = param1;
       this.tokens = [].concat(tokens);
-      this.alignment = alignment;
+      this.transitionEdits = alignment;
       this.taillessTrueKeystroke = taillessTrueKeystroke;
     } else {
       const priorToClone = param1;
       this.tokens = priorToClone.tokens.map((entry) => new ContextToken(entry));
-      this.alignment = {...priorToClone.alignment};
+      this.transitionEdits = {...priorToClone.transitionEdits};
       this.taillessTrueKeystroke = priorToClone.taillessTrueKeystroke;
     }
   }
@@ -169,20 +167,6 @@ export class ContextTokenization {
     return this.tokens.map(token => token.exampleInput);
   }
 
-  /**
-   * Determines the alignment between a new, incoming tokenization source and the
-   * tokenization modeled by the current instance.
-   * @param incomingTokenization Raw strings corresponding to the tokenization of the incoming context
-   * @param isSliding Notes if the context window is full (and sliding-alignment is particularly needed)
-   * @param noSubVerify When true, this disables inspection of 'substitute' transitions that avoids
-   *                    wholesale replacement of the original token.
-   * @returns Alignment data that details if and how the incoming tokenization aligns with
-   *          the tokenization modeled by this instance.
-   */
-  computeAlignment(incomingTokenization: string[], isSliding: boolean, noSubVerify?: boolean): ContextStateAlignment {
-    return computeAlignment(this.exampleInput, incomingTokenization, isSliding, noSubVerify);
-  }
-
   /**
    * Applies the specified Transform to the _left-hand_ side of the context in
    * order to update and match the current contents of the sliding context
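`PendingTokenization`'s declaration is not included in these hunks. Inferring from the call sites and the test fixtures further below, its shape is approximately the following sketch (field types marked `unknown` where the diff gives no detail; not the PR's actual declaration):

```ts
import { LexicalModelTypes } from '@keymanapp/common-types';

// Approximate shape, inferred from usage within this diff only.
interface PendingTokenization {
  alignment: {
    merges: unknown[];
    splits: unknown[];
    unmappedEdits: unknown[];
    // buildEdgeWindow output, augmented with the retokenized tail strings;
    // the real edgeWindow carries additional fields not shown here.
    edgeWindow: { sliceIndex: number; retokenization: string[] };
    removedTokenCount: number;
  };
  // Distribution over candidate input mappings: each sample maps a tail-token
  // index to the Transform applied to that token; `p` is the mapping's probability.
  inputs: { sample: Map<number, LexicalModelTypes.Transform>; p: number }[];
}
```

One behavioral note on the clone branch: `{...priorToClone.transitionEdits}` is a shallow copy, so a clone shares `alignment` and `inputs` with its source, and when `transitionEdits` is undefined the spread still yields a truthy `{}`. The old `alignment` field behaved the same way, so nothing changes here; it is just worth remembering.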
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts
index 9cf752a232c..daf99a37197 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts
@@ -16,12 +16,11 @@
 import Reversion = LexicalModelTypes.Reversion;
 import Suggestion = LexicalModelTypes.Suggestion;
 import Transform = LexicalModelTypes.Transform;
-
 // Mark affected tokens with the applied-suggestion transition ID
 // for easy future reference.
 const tagTokens = (state: ContextState, suggestion: Suggestion) => {
-  const alignment = state.tokenization.alignment
-  const appliedTokenCount = (alignment.canAlign && true) && (alignment.tailEditLength + Math.max(alignment.tailTokenShift, 0));
+  const inputs = state.tokenization.transitionEdits.inputs;
+  const appliedTokenCount = inputs[0].sample.size;
   const tokens = state.tokenization.tokens;
   for(let i = tokens.length - appliedTokenCount; i < tokens.length; i++) {
     tokens[i].appliedTransitionId = suggestion.transformId;
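The new count is simply the number of tail tokens touched by the most likely input mapping. For illustration, using the file's own `Transform` alias and hypothetical values:

```ts
// Hypothetical example: the top-ranked mapping edited two tokens -
// the suggestion's text plus the whitespace appended after it.
const sample = new Map<number, Transform>([
  [0, { insert: 'technical', deleteLeft: 5, deleteRight: 0 }],
  [1, { insert: ' ', deleteLeft: 0, deleteRight: 0 }]
]);
// appliedTokenCount === sample.size === 2, so the loop stamps
// suggestion.transformId onto the final two tokens.
```

Unlike the reads in predict-helpers.ts below, this site dereferences `transitionEdits` without optional chaining; presumably `tagTokens` is only reached after a suggestion application, where the field is guaranteed to exist.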
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
index 174d73ad585..d8221dde416 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -342,17 +342,18 @@ export function determineSuggestionAlignment(
    */
   deleteLeft: number
 } {
-  const alignment = transition.final.tokenization.alignment;
+  const transitionEdits = transition.final.tokenization.transitionEdits;
   const context = transition.base.context;
   const postContext = transition.final.context;
   const inputTransform = transition.inputDistribution[0].sample;
+  const inputTransformMap = transitionEdits?.inputs[0].sample;
   let deleteLeft: number;
 
   // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist.
   const wordbreak = determineModelWordbreaker(lexicalModel);
 
   // Is the token under construction newly-constructed / is there no pre-existing root?
-  if(transition.preservationTransform && alignment?.canAlign && alignment.tailTokenShift > 0) {
+  if(transition.preservationTransform && inputTransformMap?.has(1)) {
     return {
       // If the new token is due to whitespace or due to a different input type
       // that would likely imply a tokenization boundary, infer 'new word' mode.
@@ -365,7 +366,7 @@
       deleteLeft: 0
     };
   // If the tokenized context length is shorter... sounds like a backspace (or similar).
-  } else if (alignment?.canAlign && alignment.tailTokenShift < 0) {
+  } else if (transitionEdits?.alignment.removedTokenCount > 0) {
     /* Ooh, we've dropped context here. Almost certainly from a backspace or
      * similar effect. Even if we drop multiple tokens... well, we know exactly
      * how many chars were actually deleted - `inputTransform.deleteLeft`. Since
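Reading the rewritten guards (an interpretation, not text from the PR): `inputTransformMap?.has(1)` asks whether the top-ranked input mapping assigned edits to a second tail token (index 1), i.e. the keystroke spilled into a newly created token, standing in for the old `tailTokenShift > 0` check; `removedTokenCount > 0` covers the backspace-like case that `tailTokenShift < 0` used to catch. The optional chaining keeps both guards safe when no transition metadata exists:

```ts
// Sketch of the undefined case: with transitionEdits absent, the comparison
// becomes `undefined > 0`, which is false - so neither branch is taken.
const transitionEdits: PendingTokenization | undefined = undefined;
const dropsTokens = (transitionEdits?.alignment.removedTokenCount ?? 0) > 0; // false
```

(The `?? 0` appears in the sketch only to satisfy strict TypeScript; the diff's bare comparison behaves identically at runtime.)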
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
index cd396ab8a2f..e0fc4651bff 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
@@ -15,7 +15,7 @@ import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'
 import { LexicalModelTypes } from '@keymanapp/common-types';
 import { KMWString } from '@keymanapp/web-utils';
 
-import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextStateAlignment, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, models, traceInsertEdits } from '@keymanapp/lm-worker/test-index';
+import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, models, PendingTokenization, traceInsertEdits } from '@keymanapp/lm-worker/test-index';
 
 import Transform = LexicalModelTypes.Transform;
 import TrieModel = models.TrieModel;
@@ -61,6 +61,11 @@ function toMathematicalSMP(text: string) {
   return asSMP.join('');
 }
 
+const testEdgeWindowSpec = {
+  minTokens: 3,
+  minChars: 8
+};
+
 describe('ContextTokenization', function() {
   before(() => {
     KMWString.enableSupplementaryPlane(true);
@@ -72,61 +77,66 @@ describe('ContextTokenization', function() {
     let tokenization = new ContextTokenization(rawTextTokens.map((text => toToken(text))));
     assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens);
     assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' '));
-    assert.isNotOk(tokenization.alignment);
+    assert.isNotOk(tokenization.transitionEdits);
     assert.equal(tokenization.tail.exampleInput, 'day');
     assert.isFalse(tokenization.tail.isWhitespace);
   });
 
   it("constructs from a token array + alignment data", () => {
     const rawTextTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
-    let alignment: ContextStateAlignment = {
-      canAlign: true,
-      editPath: [
-        {op: 'match', input: 0, match: 0},
-        {op: 'match', input: 1, match: 1},
-        {op: 'match', input: 2, match: 2},
-        {op: 'match', input: 3, match: 3},
-        {op: 'match', input: 4, match: 4},
-        {op: 'match', input: 5, match: 5},
-        {op: 'match', input: 6, match: 6}
-      ],
-      leadTokenShift: 0,
-      leadEditLength: 0,
-      matchLength: 6,
-      tailEditLength: 1,
-      tailTokenShift: 0
+    const tokens = rawTextTokens.map((text => toTransformToken(text)));
+    const emptyTransform = { insert: '', deleteLeft: 0, deleteRight: 0 };
+
+    // We _could_ flesh this out a bit more... but it's not really needed for this test.
+    const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
+    let transitionEdits: PendingTokenization = {
+      alignment: {
+        merges: [],
+        splits: [],
+        unmappedEdits: [],
+        edgeWindow: {...edgeWindow, retokenization: rawTextTokens.slice(edgeWindow.sliceIndex)},
+        removedTokenCount: 0
+      },
+      inputs: [{sample: (() => {
+        const map = new Map();
+        map.set(0, emptyTransform);
+        return map;
+      })(), p: 1}]
     };
-    let tokenization = new ContextTokenization(rawTextTokens.map((text => toToken(text))), alignment, null /* dummy val */);
+    let tokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */);
     assert.deepEqual(tokenization.tokens.map((entry) => entry.exampleInput), rawTextTokens);
     assert.deepEqual(tokenization.tokens.map((entry) => entry.isWhitespace), rawTextTokens.map((entry) => entry == ' '));
-    assert.isOk(tokenization.alignment);
-    assert.deepEqual(tokenization.alignment, alignment);
+    assert.isOk(tokenization.transitionEdits);
+    assert.deepEqual(tokenization.transitionEdits, transitionEdits);
     assert.equal(tokenization.tail.exampleInput, 'day');
     assert.isFalse(tokenization.tail.isWhitespace);
   });
 
   it('clones', () => {
     const rawTextTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
+    const tokens = rawTextTokens.map((text => toTransformToken(text)));
+    const emptyTransform = { insert: '', deleteLeft: 0, deleteRight: 0 };
 
-    let baseTokenization = new ContextTokenization(rawTextTokens.map((text => toToken(text))), {
-      canAlign: true,
-      editPath: [
-        {op: 'match', input: 0, match: 0},
-        {op: 'match', input: 1, match: 1},
-        {op: 'match', input: 2, match: 2},
-        {op: 'match', input: 3, match: 3},
-        {op: 'match', input: 4, match: 4},
-        {op: 'match', input: 5, match: 5},
-        {op: 'match', input: 6, match: 6}
-      ],
-      leadTokenShift: 0,
-      leadEditLength: 0,
-      matchLength: 6,
-      tailEditLength: 1,
-      tailTokenShift: 0
-    }, null /* dummy val */);
+    // We _could_ flesh this out a bit more... but it's not really needed for this test.
+    const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
+    let transitionEdits: PendingTokenization = {
+      alignment: {
+        merges: [],
+        splits: [],
+        unmappedEdits: [],
+        edgeWindow: {...edgeWindow, retokenization: rawTextTokens.slice(edgeWindow.sliceIndex)},
+        removedTokenCount: 0
+      },
+      inputs: [{sample: (() => {
+        const map = new Map();
+        map.set(0, emptyTransform);
+        return map;
+      })(), p: 1}]
+    };
+
+    let baseTokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */);
 
     let cloned = new ContextTokenization(baseTokenization);
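The `transitionEdits` fixture is built twice above, verbatim, in "constructs from a token array + alignment data" and in 'clones'. A small factory would keep the two copies in sync; a sketch (the helper name is invented, not part of the PR):

```ts
// Hypothetical helper - not part of the PR.
function buildTrivialTransitionEdits(tokens: ContextToken[], rawTextTokens: string[]): PendingTokenization {
  const emptyTransform = { insert: '', deleteLeft: 0, deleteRight: 0 };
  const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
  return {
    alignment: {
      merges: [],
      splits: [],
      unmappedEdits: [],
      edgeWindow: {...edgeWindow, retokenization: rawTextTokens.slice(edgeWindow.sliceIndex)},
      removedTokenCount: 0
    },
    // new Map([[0, emptyTransform]]) is equivalent to the IIFE construction used above.
    inputs: [{ sample: new Map([[0, emptyTransform]]), p: 1 }]
  };
}
```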
@@ -154,11 +164,6 @@ describe('ContextTokenization', function() {
   });
 
   describe('evaluateTransition', () => {
-    const testEdgeWindowSpec = {
-      minTokens: 3,
-      minChars: 8
-    };
-
     it('handles simple case - new whitespace + new empty token', () => {
       const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'];
       const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));
@@ -631,17 +636,12 @@ describe('ContextTokenization', function() {
 
   describe('buildEdgeWindow', () => {
     describe('with min token count 3, char count 8', () => {
-      const editWindowSpec = {
-        minTokens: 3,
-        minChars: 8
-      }
-
       it('handles empty contexts', () => {
         const baseTokens = [''];
         const idSeed = TOKEN_TRANSFORM_SEED;
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: '',
           editBoundary: {
@@ -661,7 +661,7 @@
         const idSeed = TOKEN_TRANSFORM_SEED;
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 2 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 2 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: '',
           editBoundary: {
@@ -681,7 +681,7 @@
         const idSeed = TOKEN_TRANSFORM_SEED;
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: 'an apple',
           editBoundary: {
@@ -700,7 +700,7 @@
         const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'].map(s => toMathematicalSMP(s));
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: toMathematicalSMP('an apple'),
           editBoundary: {
@@ -721,7 +721,7 @@
         const idSeed = TOKEN_TRANSFORM_SEED;
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 2 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 2 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: ' apple a',
           editBoundary: {
@@ -740,7 +740,7 @@
         const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'].map(s => toMathematicalSMP(s));
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 2 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 2 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: toMathematicalSMP(' apple a'),
           editBoundary: {
@@ -761,7 +761,7 @@
         const idSeed = TOKEN_TRANSFORM_SEED;
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 4 }, true, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 4 }, true, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: 'pple a day',
           editBoundary: {
@@ -782,7 +782,7 @@
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
         baseTokenization.tail.isPartial = true;
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, false, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, false, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: 'apple a day',
           editBoundary: {
@@ -802,7 +802,7 @@
         const idSeed = TOKEN_TRANSFORM_SEED;
         const baseTokenization = new ContextTokenization(baseTokens.map(t => toTransformToken(t)));
 
-        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, false, editWindowSpec);
+        const results = buildEdgeWindow(baseTokenization.tokens, { insert: '', deleteLeft: 0, deleteRight: 0 }, false, testEdgeWindowSpec);
         assert.deepEqual(results, {
           retokenizationText: 'apple a day ',
           editBoundary: {
@@ -949,7 +949,7 @@ describe('ContextTokenization', function() {
       assert.equal(baseTexts.join('').length, 73);
       assert.equal(baseTexts.length, 25);
 
-      const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null);
+      const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)));
       const resultTokenization = baseTokenization.applyContextSlide(plainModel,
         { insert: ' ', deleteLeft: 0, deleteRight: 9 });
@@ -964,7 +964,7 @@
         "sauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem"
       ];
 
-      const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null);
+      const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)));
       const resultTokenization = baseTokenization.applyContextSlide(plainModel,
         { insert: 'apple', deleteLeft: 0, deleteRight: 0 });
@@ -981,7 +981,7 @@
         "nd", " ", "orange", " ",
         "juice", " ", "seem", " ", "like", " ", "breakfast"
       ];
 
-      const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null);
+      const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)));
       const resultTokenization = baseTokenization.applyContextSlide(plainModel, { insert: 'applesauce a', deleteLeft: 0, deleteRight: 0 });
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts
index 0b2a58da47b..e5c511adb76 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts
@@ -104,7 +104,7 @@ describe('determineContextTransition', () => {
     assert.equal(transition, tracker.latest);
     assert.isFalse(warningEmitterSpy.called);
     assert.sameOrderedMembers(transition.final.tokenization.exampleInput, ['this', ' ', 'is', ' ', 'for', ' ', 'techn']);
-    assert.isOk(transition.final.tokenization.alignment);
+    assert.isOk(transition.final.tokenization.transitionEdits);
     assert.equal(transition.final.context.left, targetContext.left);
     assert.equal(transition.final.context.right ?? "", targetContext.right ?? "");
     assert.sameDeepOrderedMembers(transition.inputDistribution, inputDistribution);