diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts index 5b59035e133..7c3565c7e89 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts @@ -17,7 +17,8 @@ import { ContextToken } from './context-token.js'; import { ContextTokenization } from './context-tokenization.js'; import { ContextTransition } from './context-transition.js'; import { determineModelTokenizer } from '../model-helpers.js'; -import { tokenizeAndFilterDistribution } from './transform-tokenization.js'; +import { legacySubsetKeyer, TokenizationSubsetBuilder } from './tokenization-subsets.js'; +import TransformUtils from '../transformUtils.js'; import Context = LexicalModelTypes.Context; import Distribution = LexicalModelTypes.Distribution; @@ -197,57 +198,58 @@ export class ContextState { ): ContextTransition { const lexicalModel = this.model; - // Apply all transforms to the base context state - const transformSequenceDistribution = tokenizeAndFilterDistribution(context, lexicalModel, transformDistribution); - const postContext = transformDistribution?.[0] ? applyTransform(transformDistribution[0].sample, context) : context; - - // Note for future: the next line's pattern asserts that there is only one true tokenization. - // We may eventually allow for multiple potential tokenizations (per epic-dict-breaker) - const tokenizedContext = determineModelTokenizer(lexicalModel)(postContext).left; - if(tokenizedContext.length == 0) { - tokenizedContext.push({text: ''}); - } - // In which case we could try need to align for each of them, starting from the most likely. - - // If we're not at the start of the buffer, we're probably a sliding context. - const isSliding = !this.context.startOfBuffer; - - // It's possible the tokenization will remember more of the initial token than is - // actually present in the sliding context window, which imposes a need for a wide-band - // computeDistance 'radius' in the called function. - const alignmentResults = this.tokenization.computeAlignment(tokenizedContext.map((token) => token.text), isSliding, isApplyingSuggestion); + const trueInput = transformDistribution[0].sample; + const transition = new ContextTransition(this, this.appliedInput?.id); - if(alignmentResults.canAlign == false) { // Needs to be explicit for TS type inference. - if(console && console.error) { - console.error(`Could not align contexts with edit path ${JSON.stringify(alignmentResults.editPath)}`); - } - return null; - } + // From here on, we work toward the common-case - re-using old info when + // context (and its tokenization) is changed by an input Transform. - const resultTokenization = this.tokenization.transitionTo( - tokenizedContext, - alignmentResults, - lexicalModel, - transformSequenceDistribution - ); + let trueInputSubsetKey: string; + const slideUpdateTransform = determineContextSlideTransform(this.context, context); - if(!resultTokenization) { - if(console && console.error) { - console.error(`Transition to alignable tokenization failed: alignment properties ${JSON.stringify(alignmentResults)}`); - } - return null; - } + // Goal: allow multiple base tokenizations. + const startTokenizations = [this.tokenization]; + const startTokenizationsAfterSlide = startTokenizations.map(t => t.applyContextSlide(lexicalModel, slideUpdateTransform)); - const transition = new ContextTransition(this, this.appliedInput?.id); - // Occurs on context resets & after applying suggestions/reversions - if(resultTokenization == this.tokenization) { + // Easy case - no net change to the tokenizations whatsoever; the actual request + // aims to save-state the most recent results. + // + // This behavior occurs during context resets & after applying suggestions/reversions. + if(TransformUtils.isEmpty(trueInput) && transformDistribution.length == 1) { // If the tokenizations match, clone the ContextState; we want to preserve a post-application // context separately from pre-application contexts for predictions based on empty roots. const state = new ContextState(this); + state.tokenization = startTokenizationsAfterSlide[0]; transition.finalize(state, transformDistribution); return transition; } + const subsetBuilder = new TokenizationSubsetBuilder(legacySubsetKeyer); + for(let baseTokenization of startTokenizationsAfterSlide) { + + for(let mass of transformDistribution) { + const tokenizationAnalysis = baseTokenization.mapWhitespacedTokenization(lexicalModel, mass.sample); + subsetBuilder.addPrecomputation(baseTokenization, tokenizationAnalysis, mass.p); + + if(mass.sample == trueInput) { + trueInputSubsetKey = subsetBuilder.keyer(tokenizationAnalysis); + } + } + } + + // And now to (partly) detransform from a multiple-tokenization paradigm. + const trueInputSubset = subsetBuilder.subsets.get(trueInputSubsetKey); + // Right now, we only have one base tokenization, so we just fetch it. + const baseTokenization = startTokenizationsAfterSlide[0]; + // For multiple tokenizations, we'd retrieve each, use the "most likely" one as base, + // and then fold all resulting search spaces (on the final token) into one. + const tokenizationAnalysis = trueInputSubset.pendingSet.get(baseTokenization); + + // Should gain one per subsetBuilder.subsets entry. + const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, lexicalModel, trueInput); + + // ------------ + // So, if we have a suggestion transition ID at the end and didn't just apply... // we've just returned to the end of an applied suggestion's token. // @@ -272,28 +274,74 @@ export class ContextState { // We expect such cases to have SOMETHING for a preservation transform here; // we need to ensure that any suggestions for the new token believe that // the token is starting fresh, without any prior text. - if(alignmentResults.tailTokenShift > 0) { + // + // We actually will want to build `preservationTransform`s based on the path + // leading to each correction/suggestion. But, until now, we've just built + // it based upon the actual input transform - so we'll maintain (temporarily) + // as a transitional state. + + const bestResultAnalysis = tokenizationAnalysis; + // inputTransform is the ideal transform we found. + + // If tokens were inserted, emit an empty transform; this prevents + // suggestions from replacing the "current" token. + const bestTokenizedInput = bestResultAnalysis.inputs[0].sample; + if(bestTokenizedInput.size > 1 || bestTokenizedInput.has(1)) { preservationTransform = { insert: '', deleteLeft: 0 }; } - if(transformSequenceDistribution) { - const transformKeys = [...transformSequenceDistribution[0].sample.keys()]; - // Leave out the final entry - that part is replaceable by suggestions. - transformKeys.pop(); - - for(let i of transformKeys) { - const primaryInput = transformSequenceDistribution[0].sample.get(i); - if(!preservationTransform) { - preservationTransform = primaryInput; - } else { - preservationTransform.insert += primaryInput.insert; - preservationTransform.deleteLeft += primaryInput.deleteLeft; - } + const transformKeys = [...bestResultAnalysis.inputs[0].sample.keys()]; + transformKeys.pop(); + + for(let i of transformKeys) { + /* + * Thinking ahead to multitokenization: + * + * If what we have is not on the "true" tokenization, then... we need to + * do multitoken effects, right? We're basing new suggestions based on a + * state that does not currently exist! We'd need to enforce THAT state, + * *then* do the suggestion! + * - Which gets fun if we auto-apply such a case, as the new "true" tokenization + * no longer results directly from the true input. + * + * If we give tokens unique IDs on first creation, we could backtrace to + * find the most recent common ancestor. + * - simple cases (same 'token', but different input transform lengths/effects) + * will have the same prior token ID + */ + const primaryInput = bestResultAnalysis.inputs[0].sample.get(i); + if(!preservationTransform) { + preservationTransform = primaryInput; + } else { + preservationTransform.insert += primaryInput.insert; + preservationTransform.deleteLeft += primaryInput.deleteLeft; } } - const state = new ContextState(postContext, lexicalModel); - state.tokenization = resultTokenization; + const postContext = transformDistribution?.[0] ? applyTransform(transformDistribution[0].sample, context) : context; + + // Note for future: the next line's pattern asserts that there is only one true tokenization. + // We may eventually allow for multiple potential tokenizations (per epic-dict-breaker) + const tokenizedContext = determineModelTokenizer(lexicalModel)(postContext).left; + if(tokenizedContext.length == 0) { + tokenizedContext.push({text: ''}); + } + // In which case we could try need to align for each of them, starting from the most likely. + + // If we're not at the start of the buffer, we're probably a sliding context. + const isSliding = !this.context.startOfBuffer; + + // It's possible the tokenization will remember more of the initial token than is + // actually present in the sliding context window, which imposes a need for a wide-band + // computeDistance 'radius' in the called function. + const alignmentResults = this.tokenization.computeAlignment(tokenizedContext.map((token) => token.text), isSliding, isApplyingSuggestion); + + // Stopgap: add tokenized transformSequenceDistribution to the alignment data & use that + // where noted: tagTokens() in context-transition.ts, `determineSuggestionAlignment()`. + + + const state = new ContextState(applyTransform(trueInput, context), lexicalModel); + state.tokenization = new ContextTokenization(resultTokenization.tokens, alignmentResults); state.appliedInput = transformDistribution?.[0].sample; transition.finalize(state, transformDistribution, preservationTransform); transition.revertableTransitionId = appliedSuggestionTransitionId; @@ -315,7 +363,7 @@ export class ContextState { * @returns The substring prepended to the context (if sliding backward) or the * number of codepoints removed from its start (if sliding forward) */ -export function determineContextSlideTransform(srcContext: Context, dstContext: Context): Transform { +export function determineContextSlideTransform(srcContext: Context, dstContext: Context): Transform & { deleteRight: number } { // Assumption: the current (sliding) context window is alignable. // See `matchBaseContextState` in ../predict-helpers.ts. diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index e2b7fa06f87..345145c902e 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -7,7 +7,6 @@ * the sliding context window for one specific instance of context state. */ -import { Token } from '@keymanapp/models-templates'; import { LexicalModelTypes } from '@keymanapp/common-types'; import { KMWString } from '@keymanapp/web-utils'; @@ -19,9 +18,7 @@ import { determineModelTokenizer } from '../model-helpers.js'; import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js'; import { PendingTokenization } from './tokenization-subsets.js'; -import Distribution = LexicalModelTypes.Distribution; import LexicalModel = LexicalModelTypes.LexicalModel; -import ProbabilityMass = LexicalModelTypes.ProbabilityMass; import Transform = LexicalModelTypes.Transform; // May be able to "get away" with 2 & 5 or so, but having extra will likely help @@ -538,7 +535,9 @@ export class ContextTokenization { throw new Error("Not yet supported."); } - tokenization.push(new ContextToken(baseTokenization[i])); + const token = new ContextToken(baseTokenization[i]); + token.isPartial = false; + tokenization.push(token); } // Assumption: inputs.length > 0. (There is at least one input transform.) @@ -565,6 +564,9 @@ export class ContextTokenization { tokenization.splice(tokenIndex, 1, affectedToken); } + affectedToken.isPartial = true; + delete affectedToken.appliedTransitionId; + // If we are completely replacing a token via delete left, erase the deleteLeft; // that part applied to a _previous_ token that no longer exists. // We start at index 0 in the insert string for the "new" token. @@ -582,230 +584,6 @@ export class ContextTokenization { return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tokenization), null /* tokenMapping */); } - - /** - * Given an alignment between an incoming tokenization context and the current tokenization - * instance, this method will produce a new ContextTokenization instance for the incoming context - * that reuses as many correction-search intermediate results as possible. - * @param tokenizedContext A single tokenization for the incoming context that aligns well - * with the tokenization represented by the current instance. - * @param alignment The alignment, as determined by a prior call to `computeAlignment`. - * @param lexicalModel The active lexical model - * @param alignedTransformDistribution The tokenized version of the input distribution accounting - * for the difference between the context represented by this instance and that of `tokenizedContext`. - * @returns - */ - transitionTo( - tokenizedContext: Token[], - alignment: ContextStateAlignment, - lexicalModel: LexicalModel, - // FUTURE NOTE: especially for epic-dict-breaker, we'll want an array of these - to align across multiple transitions - // in case word boundaries shift back and forth. - alignedTransformDistribution: Distribution> - ): ContextTokenization { - if(!alignment.canAlign) { - return null; - } - - const { - leadTokenShift, - leadEditLength, - matchLength, - tailEditLength, - tailTokenShift - } = alignment; - const hasDistribution = alignedTransformDistribution?.length > 0; - - // If we have a perfect match with a pre-existing tokenization, no mutations - // have happened; just re-use the old context tokenization. - if(leadEditLength == 0 && leadTokenShift == 0 && tailTokenShift == 0 && tailEditLength == 0) { - // We must build a new instance in case the original did not have - // alignment data (like when it's the initial context!) - return new ContextTokenization(this.tokens, alignment); - } else { - // If we didn't get any input, we really should perfectly match - // a previous context state. If such a state is out of our cache, - // it should simply be rebuilt. - if(!hasDistribution) { - return null; - } - } - - // If mutations HAVE happened, we have work to do. - const tokenization = this.tokens.map((token) => new ContextToken(token)); - - if(leadTokenShift < 0) { - tokenization.splice(0, -leadTokenShift); - } else if(leadTokenShift > 0) { - // insert token(s) at the start to match the text that's back within the - // sliding context window. - - const reinsertedTokens = tokenizedContext.slice(0, leadTokenShift); - while(reinsertedTokens.length > 0) { - const reinserted = reinsertedTokens.pop(); - const token = new ContextToken(lexicalModel, reinserted.text); - tokenization.unshift(token); - } - } - - const incomingOffset = (leadTokenShift > 0 ? leadTokenShift : 0); - const matchingOffset = (leadTokenShift < 0 ? -leadTokenShift : 0); - - // If a word is being slid out of context-window range, start trimming it - we should - // no longer need to worry about reusing its original correction-search results. - for(let i = 0; i < leadEditLength; i++) { - if(this.tokens[matchingOffset+i].exampleInput != tokenizedContext[incomingOffset+i].text) { - //this.tokens[matchingOffset]'s clone is at tokenization[incomingOffset] - //after the splice call in a previous block. - tokenization[incomingOffset+i] = new ContextToken(lexicalModel, tokenizedContext[incomingOffset+i].text); - } - } - - // If no TAIL mutations have happened, we're safe to return now. - if(tailEditLength == 0 && tailTokenShift == 0) { - return new ContextTokenization(tokenization, alignment); - } - - // first non-matched tail index within the incoming context - const incomingTailUpdateIndex = matchLength + leadEditLength + incomingOffset; - // first non-matched tail index in `matchState`, the base context state. - const matchingTailUpdateIndex = matchLength + leadEditLength + matchingOffset; - - // The assumed input from the input distribution is always at index 0. - const tokenizedPrimaryInput = hasDistribution ? alignedTransformDistribution[0].sample : null; - - // now that we've identified the 'primary input', sort the distributions. - alignedTransformDistribution.sort((a, b) => b.p - a.p); - - // first index: original sample's tokenization - // second index: token index within original sample - const tokenDistribution = alignedTransformDistribution.map((entry) => { - const remap: Map> = new Map(); - - for(const pair of entry.sample.entries()) { - remap.set(pair[0], { - sample: pair[1], - p: entry.p - }); - } - - return remap; - }); - - // The original tail token should match index 0. If tokens have been deleted, that - // left-shifts our base indices; we start left of 0. If more than one token was - // edited, those edits occur to the left as well - and further left of whatever - // the new tail token is *if* tokens were removed. - const firstTailEditIndex = Math.min((1 - tailEditLength), 0) + Math.min(tailTokenShift, 0); - let primaryInputAppliedLen = 0; - for(let i = 0; i < tailEditLength; i++) { - const tailIndex = firstTailEditIndex + i; - - // do tail edits - const incomingIndex = i + incomingTailUpdateIndex; - const matchingIndex = i + matchingTailUpdateIndex; - - const incomingToken = tokenizedContext[incomingIndex]; - const matchedToken = this.tokens[matchingIndex]; - - let primaryInput = hasDistribution ? tokenizedPrimaryInput.get(tailIndex) : null; - const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput); - let token: ContextToken; - - if(isBackspace) { - token = new ContextToken(lexicalModel, incomingToken.text); - token.searchSpace.inputSequence.forEach((entry) => entry[0].sample.id = primaryInput.id); - } else { - // Assumption: there have been no intervening keystrokes since the last well-aligned context. - // (May not be valid with epic/dict-breaker or with complex, word-boundary crossing transforms) - token = new ContextToken(matchedToken); - - // Erase any applied-suggestion transition ID; it is no longer valid. - token.appliedTransitionId = undefined; - const emptySample: ProbabilityMass = { sample: { insert: '', deleteLeft: 0 }, p: 1 }; - const dist = tokenDistribution.map((seq) => seq.get(tailIndex) ?? emptySample); - token.addInput({trueTransform: primaryInput ?? emptySample.sample, inputStartIndex: primaryInputAppliedLen}, dist); - } - - tokenization[incomingIndex] = token; - primaryInputAppliedLen += KMWString.length(primaryInput?.insert ?? ''); - } - - if(tailTokenShift < 0) { - // delete tail tokens - for(let i = 0; i > tailTokenShift; i--) { - // If ALL that remains are deletes, we're good to go. - // - // This may not be the token at the index, but since all that remains are deletes, - // we'll have deleted the correct total number from the end once all iterations - // are done. - tokenization.pop(); - } - } else { - // First tail insertion index within the tokenized-transform map is always 1. - for(let i = 1; i <= tailTokenShift; i++) { - // create tail tokens - // same original base after all edited - const incomingIndex = incomingTailUpdateIndex + tailEditLength + (i - 1); - const incomingToken = tokenizedContext[incomingIndex]; - // // Assertion: there should be no matching token; this should be a newly-appended token. - // const matchingIndex = i + tailEditLength + matchingTailUpdateIndex; - - const primaryInput = hasDistribution ? tokenizedPrimaryInput.get(i) : null; - let pushedToken = new ContextToken(lexicalModel); - - // TODO: assumes that there was no shift in wordbreaking for the actual - // context when transitioning from the prior context to the current one. - // This may actually be a major issue for dictionary-based wordbreaking! - // - // If there was such a shift, then we may have extra transforms - // originally on a 'previous' token that got moved into this one! - // - // Suppose we're using a dictionary-based wordbreaker and have - // `butterfl` for our context, which could become butterfly. If the - // next keystroke results in `butterfli`, this would likely be - // tokenized `butter` `fli`. (e.g: `fli` leads to `flight`.) How do - // we know to properly relocate the `f` and `l` transforms? - - // Build a distribution for transforms aligned to the current token, - // then remove any empty / null / undefined entries. - let tokenDistribComponent = tokenDistribution.map((seq) => { - const entry = seq.get(i); - // Do not add empty Transforms into the correction-search input - // at this stage. - if(!entry || TransformUtils.isEmpty(entry.sample)) { - return null; - } else { - return entry; - } - }).filter((entry) => !!entry); - if(primaryInput) { - let transformDistribution = tokenDistribComponent.length > 0 ? tokenDistribComponent : null; - - // If there are no entries in our would-be distribution, there's no - // reason to pass in what amounts to a no-op. - if(transformDistribution) { - // If we ever stop filtering tokenized transform distributions, it may - // be worth adding an empty transform here with weight to balance - // the distribution back to a cumulative prob sum of 1. - pushedToken.addInput({ trueTransform: primaryInput, inputStartIndex: primaryInputAppliedLen }, transformDistribution); - } - } else if(incomingToken.text) { - // We have no transform data to match against an inserted token with text; abort! - // Refer to #12494 for an example case; we currently can't map previously-committed - // input transforms to a newly split-off token. - return null; - } - pushedToken.isWhitespace = incomingToken.isWhitespace; - - // Auto-replaces the search space to correspond with the new token. - tokenization.push(pushedToken); - primaryInputAppliedLen += KMWString.length(primaryInput.insert); - } - } - - return new ContextTokenization(tokenization, alignment); - } } const appendText = (full: string, current: string) => full + current; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts index c4e2962867d..d7cab30e463 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts @@ -181,7 +181,7 @@ export class TokenizationSubsetBuilder { readonly keyer: typeof precomputationSubsetKeyer; constructor(keyer?: typeof precomputationSubsetKeyer) { - this.keyer = precomputationSubsetKeyer; + this.keyer = keyer ?? precomputationSubsetKeyer; } addPrecomputation(tokenization: ContextTokenization, precomputation: TokenizationTransitionEdits, p: number) { diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts index 5787097190a..d83915a3b5f 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts @@ -99,13 +99,13 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - // Phrased this way to facilitate TS type-inference; assert.isTrue() does - // NOT do this for us! - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // // Phrased this way to facilitate TS type-inference; assert.isTrue() does + // // NOT do this for us! + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); it("properly matches and aligns when no context changes occur (after whitespace)", function() { @@ -126,13 +126,13 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - // Phrased this way to facilitate TS type-inference; assert.isTrue() does - // NOT do this for us! - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // // Phrased this way to facilitate TS type-inference; assert.isTrue() does + // // NOT do this for us! + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); it("properly matches and aligns when lead token is removed (end of word)", function() { @@ -144,7 +144,7 @@ describe('ContextState', () => { deleteLeft: 0 }; let newContext = { - left: " apple a day keeps the doctor", startOfBuffer: true, endOfBuffer: true + left: " apple a day keeps the doctor", startOfBuffer: false, endOfBuffer: true }; let rawTokens = [" ", "apple", " ", "a", " ", "day", " ", "keeps", " ", "the", " ", "doctor"]; @@ -153,11 +153,11 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -1); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -1); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); it("properly matches and aligns when lead token is removed (after whitespace)", function() { @@ -169,7 +169,7 @@ describe('ContextState', () => { deleteLeft: 0 }; let newContext = { - left: " apple a day keeps the doctor ", startOfBuffer: true, endOfBuffer: true + left: " apple a day keeps the doctor ", startOfBuffer: false, endOfBuffer: true }; let rawTokens = [" ", "apple", " ", "a", " ", "day", " ", "keeps", " ", "the", " ", "doctor", " ", ""]; @@ -178,11 +178,11 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -1); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -1); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); it("properly matches and aligns when lead token + following whitespace are removed", function() { @@ -194,7 +194,7 @@ describe('ContextState', () => { deleteLeft: 0 }; let newContext = { - left: "apple a day keeps the doctor", startOfBuffer: true, endOfBuffer: true + left: "apple a day keeps the doctor", startOfBuffer: false, endOfBuffer: true }; let rawTokens = ["apple", " ", "a", " ", "day", " ", "keeps", " ", "the", " ", "doctor"]; @@ -203,11 +203,11 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -2); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -2); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); it("properly matches and aligns when final token is edited", function() { @@ -225,11 +225,11 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); // Needs improved context-state management (due to 2x tokens) @@ -253,13 +253,16 @@ describe('ContextState', () => { // The 'wordbreak' transform let state = newContextMatch?.final; assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputSequence); - assert.isEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence); + assert.sameDeepMembers( + state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence, + [[{sample: { insert: '', deleteLeft: 0 }, p: 1}]] + ); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); }); it("properly matches and aligns when whitespace before final empty token is extended", function() { @@ -287,11 +290,11 @@ describe('ContextState', () => { [[{ sample: {insert: '', deleteLeft: 0}, p: 1 }]] ); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); }); it("properly matches and aligns when a 'wordbreak' is removed via backspace", function() { @@ -309,11 +312,11 @@ describe('ContextState', () => { assert.isOk(newContextMatch?.final); assert.deepEqual(newContextMatch?.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, -2); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, -2); }); it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() { @@ -337,11 +340,11 @@ describe('ContextState', () => { assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputSequence); assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 1); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 1); }) // Needs improved context-state management (due to 2x tokens) @@ -354,7 +357,7 @@ describe('ContextState', () => { deleteLeft: 0 } let newContext = { - left: "apple a day keeps the doctor", startOfBuffer: true, endOfBuffer: true + left: "apple a day keeps the doctor", startOfBuffer: false, endOfBuffer: true }; let rawTokens = ["apple", " ", "a", " ", "day", " ", "keeps", " ", "the", " ", "doctor", " ", ""]; @@ -368,13 +371,16 @@ describe('ContextState', () => { // The 'wordbreak' transform let state = newContextMatch.final; assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputSequence); - assert.isEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence); + assert.deepEqual( + state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence, + [[{sample: {insert: '', deleteLeft: 0}, p: 1}]] + ); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -2); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, -2); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); }); it("properly matches and aligns when initial token is modified AND a 'wordbreak' is added'", function() { @@ -397,13 +403,16 @@ describe('ContextState', () => { // The 'wordbreak' transform let state = newContextMatch.final; assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputSequence); - assert.isEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence); + assert.deepEqual( + state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence, + [[{sample: {insert: '', deleteLeft: 0}, p: 1}]] + ); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); }); it("properly matches and aligns when tail token is modified AND a 'wordbreak' is added'", function() { @@ -426,16 +435,16 @@ describe('ContextState', () => { // The 'wordbreak' transform let state = newContextMatch.final; assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputSequence); - assert.isEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence); + assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputSequence); - if(!newContextMatch.final.tokenization.alignment.canAlign) { - assert.fail("context alignment failed"); - } - assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); + // if(!newContextMatch.final.tokenization.alignment.canAlign) { + // assert.fail("context alignment failed"); + // } + // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); + // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); }); - it('rejects hard-to-handle case: tail token is split into three rather than two', function() { + it('handles case where tail token is split into three rather than two', function() { let baseContext = models.tokenize(defaultBreaker, { left: "text'", startOfBuffer: true, endOfBuffer: true }); @@ -456,7 +465,9 @@ describe('ContextState', () => { deleteLeft: 0 } let problemContextMatch = baseState.analyzeTransition({left: "text'", startOfBuffer: true, endOfBuffer: true}, [{sample: transform, p: 1}]); - assert.isNull(problemContextMatch); + assert.isNotNull(problemContextMatch); + + assert.deepEqual(problemContextMatch.final.tokenization.exampleInput, ['text', '\'', '"']); }); }); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index ed8c270b6a5..0b44f9ffc52 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -153,524 +153,6 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.exampleInput, rawTextTokens); }); - describe('transitionTo', function() { - it('simple case - new whitespace + new empty token', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day']; - const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)), null); - - const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', ''].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - inputTransformMap.set(1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set(2, { insert: '', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'insert', match: 7}, - {op: 'insert', match: 8} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 7, - tailEditLength: 0, - tailTokenShift: 2 - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), - targetTokens - ); - }); - - it('simple case - new character added to last token', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'da']; - const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)), null); - - const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day'].map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: 'y', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'substitute', input: 6, match: 6} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 6, - tailEditLength: 1, - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1 }] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), - targetTokens - ); - }); - - it('merges new whitespace character added to last whitespace token if tail is empty', () => { - const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', '']; - const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t)), null); - - const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', ''].map((t) => ( - {text: t, isWhitespace: t != '' && t.trim() == ''} - )); - const inputTransformMap: Map = new Map(); - inputTransformMap.set(-1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set( 0, { insert: '', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'substitute', input: 7, match: 7}, - {op: 'substitute', input: 8, match: 8} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 7, - tailEditLength: 2, - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1 }] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), - targetTokens - ); - }); - - it('simple case - context-window slide deletes first char of word', () => { - // string length: 64 - const baseTexts = [ - "applesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", - "seem", " ", "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "" - ]; - assert.equal(baseTexts.join('').length, 64); - - assert.equal(baseTexts.length, 23); - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - "pplesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", - "seem", " ", "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "b" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: 'b', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'match', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'match', input: 10, match: 10}, - {op: 'match', input: 11, match: 11}, - {op: 'match', input: 12, match: 12}, - {op: 'match', input: 13, match: 13}, - {op: 'match', input: 14, match: 14}, - {op: 'match', input: 15, match: 15}, - {op: 'match', input: 16, match: 16}, - {op: 'match', input: 17, match: 17}, - {op: 'match', input: 18, match: 18}, - {op: 'match', input: 19, match: 19}, - {op: 'match', input: 20, match: 20}, - {op: 'match', input: 21, match: 21}, - {op: 'substitute', input: 22, match: 22} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 21, - tailEditLength: 1, - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - - it('context-window slide deletes majority of word', () => { - // string length: 73 - const baseTexts = [ - "applesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", - "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "best", " ", "brea" - ]; - assert.equal(baseTexts.join('').length, 73); - - assert.equal(baseTexts.length, 25); - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - "e", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", - "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "best", " ", "break" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: 'k', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'match', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'match', input: 10, match: 10}, - {op: 'match', input: 11, match: 11}, - {op: 'match', input: 12, match: 12}, - {op: 'match', input: 13, match: 13}, - {op: 'match', input: 14, match: 14}, - {op: 'match', input: 15, match: 15}, - {op: 'match', input: 16, match: 16}, - {op: 'match', input: 17, match: 17}, - {op: 'match', input: 18, match: 18}, - {op: 'match', input: 19, match: 19}, - {op: 'match', input: 20, match: 20}, - {op: 'match', input: 21, match: 21}, - {op: 'match', input: 22, match: 22}, - {op: 'match', input: 23, match: 23}, - {op: 'substitute', input: 24, match: 24} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 23, - tailEditLength: 1, - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - - it('handles extension of head token from backward context-window slide', () => { - const baseTexts = [ - "sauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem" - ]; - - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - "applesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: '', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - // Yay for being able to mock the alignment data for the test! We don't - // actually need to use a full 64-char string as long as we craft this - // properly. - targetTokens, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'match', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'match', input: 10, match: 10} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: baseTexts.length - 1, - tailEditLength: 0, - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - - it('handles large backward context-window slide jump', () => { - // Note: this is not the actual pathway used for reverting suggestions, - // though the scenario is somewhat analogous. - const baseTexts = [ - "nd", " ", "orange", " ", "juice", " ", "seem", " ", "like", " ", "breakfast" - ]; - - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - "sauce", " ", "and", " ", "orange", " ", "juice", " ", "seem", " ", "like", " ", - "brea" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - - inputTransformMap.set(0, { insert: '', deleteLeft: 5 }); - - const tokenization = baseTokenization.transitionTo( - // Yay for being able to mock the alignment data for the test! We don't - // actually need to use a full 64-char string as long as we craft this - // properly. - targetTokens, { - canAlign: true, - editPath: [ - {op: 'insert', match: 0}, - {op: 'insert', match: 1}, - {op: 'match', input: 0, match: 2}, - {op: 'match', input: 1, match: 3}, - {op: 'match', input: 2, match: 4}, - {op: 'match', input: 3, match: 5}, - {op: 'match', input: 4, match: 6}, - {op: 'match', input: 5, match: 7}, - {op: 'match', input: 6, match: 8}, - {op: 'match', input: 7, match: 9}, - {op: 'match', input: 8, match: 10}, - {op: 'match', input: 9, match: 11}, - {op: 'match', input: 10, match: 12} - ], - leadTokenShift: 2, // "applesauce", " " - leadEditLength: 1, // "nd" / "and" - matchLength: baseTexts.length - 2, - tailEditLength: 1, // "breakfast" / "brea" - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.equal(tokenization.tail.exampleInput, "brea"); - assert.equal(tokenization.tail.searchSpace.inputSequence.length, "brea".length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - - it('handles word-break boundary shift during backward context-window slide', () => { - const baseTexts = [ - // Without any preceding adjacent char in view, we can only interpret - // the leading `'` as an opening single-quote. - /*isn*/ "'", "t", " ", "orange", " ", "juice", " ", "tasty", "?", " ", "I", " ", "think" - ]; - - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - // With one preceding adjacent non-whitespace char in view, we now - // realize it was part of a word... and remove a wordbreak! - /*is"*/ "n't", " ", "orange", " ", "juice", " ", "tasty", "?", " ", "I", " ", "thin" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - - inputTransformMap.set(0, { insert: '', deleteLeft: 1 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'substitute', input: 1, match: 0}, - {op: 'substitute', input: 2, match: 1}, - {op: 'substitute', input: 3, match: 2}, - {op: 'substitute', input: 4, match: 3}, - {op: 'substitute', input: 5, match: 4}, - {op: 'substitute', input: 6, match: 5}, - {op: 'substitute', input: 7, match: 6}, - {op: 'substitute', input: 8, match: 7}, - {op: 'match', input: 9, match: 8}, - {op: 'match', input: 10, match: 9}, - {op: 'match', input: 11, match: 10}, - {op: 'match', input: 12, match: 11} - ], - leadTokenShift: -1, // "'", - leadEditLength: 1, // "t" / "n't" - matchLength: baseTexts.length - 3, - tailEditLength: 1, // "think" / "thin" - tailTokenShift: 0 - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.equal(tokenization.tail.exampleInput, "thin"); - assert.equal(tokenization.tail.searchSpace.inputSequence.length, "thin".length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - - it('handles word-break boundary shifts at both ends during backward context-window slide', () => { - const baseTexts = [ - // Without any preceding adjacent char in view, we can only interpret - // the leading `'` as an opening single-quote. - /*isn*/ "'", "t", " ", "orange", " ", "juice", " ", "tasty", "?", " ", "I", " ", "find", " ", "" - ]; - - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - // With one preceding adjacent non-whitespace char in view, we now - // realize it was part of a word... and remove a wordbreak! - /*is"*/ "n't", " ", "orange", " ", "juice", " ", "tasty", "?", " ", "I", " ", "find" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - - inputTransformMap.set(0, { insert: '', deleteLeft: 1 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'substitute', input: 1, match: 0}, - {op: 'substitute', input: 2, match: 1}, - {op: 'substitute', input: 3, match: 2}, - {op: 'substitute', input: 4, match: 3}, - {op: 'substitute', input: 5, match: 4}, - {op: 'substitute', input: 6, match: 5}, - {op: 'substitute', input: 7, match: 6}, - {op: 'substitute', input: 8, match: 7}, - {op: 'match', input: 9, match: 8}, - {op: 'match', input: 10, match: 9}, - {op: 'match', input: 11, match: 10}, - {op: 'match', input: 12, match: 11}, - {op: 'delete', input: 13}, - {op: 'delete', input: 14} - ], - leadTokenShift: -1, // "'", - leadEditLength: 1, // "t" / "n't" - matchLength: baseTexts.length - 4, - tailEditLength: 0, - tailTokenShift: -2 // " ", "" - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.equal(tokenization.tail.exampleInput, "find"); - assert.equal(tokenization.tail.searchSpace.inputSequence.length, "find".length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - - it('handles word-break boundary shifts at both ends during forward context-window slide', () => { - const baseTexts = [ - // With one preceding adjacent non-whitespace char in view, it's part of - // a word... and remove a wordbreak! - /*is"*/ "n't", " ", "orange", " ", "juice", " ", "tasty", "?", " ", "I", " ", "find" - ]; - - const baseTokenization = new ContextTokenization(baseTexts.map(t => toToken(t)), null); - - const targetTexts = [ - // Without any preceding adjacent char in view, we can only interpret - // the leading `'` as an opening single-quote. - /*isn*/ "'", "t", " ", "orange", " ", "juice", " ", "tasty", "?", " ", "I", " ", "find", " ", "" - ]; - const targetTokens = targetTexts.map((t) => ({text: t, isWhitespace: t == ' '})); - const inputTransformMap: Map = new Map(); - - inputTransformMap.set(1, { insert: ' ', deleteLeft: 0 }); - inputTransformMap.set(2, { insert: '', deleteLeft: 0 }); - - const tokenization = baseTokenization.transitionTo( - targetTokens, { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'substitute', input: 1, match: 0}, - {op: 'substitute', input: 2, match: 1}, - {op: 'substitute', input: 3, match: 2}, - {op: 'substitute', input: 4, match: 3}, - {op: 'substitute', input: 5, match: 4}, - {op: 'substitute', input: 6, match: 5}, - {op: 'substitute', input: 7, match: 6}, - {op: 'substitute', input: 8, match: 7}, - {op: 'match', input: 9, match: 8}, - {op: 'match', input: 10, match: 9}, - {op: 'match', input: 11, match: 10}, - {op: 'match', input: 12, match: 11}, - {op: 'insert', match: 12}, - {op: 'insert', match: 13} - ], - leadTokenShift: 1, // "'", - leadEditLength: 1, // "n't" / "t" - matchLength: baseTexts.length - 1, - tailEditLength: 0, - tailTokenShift: 2 // " ", "" - }, - plainModel, - [{ sample: inputTransformMap, p: 1}] - ); - - assert.isOk(tokenization); - assert.equal(tokenization.tokens.length, targetTokens.length); - assert.equal(tokenization.tail.exampleInput, ""); - assert.equal(tokenization.tail.searchSpace.inputSequence.length, "".length); - assert.sameOrderedMembers(tokenization.tokens.map(t => t.exampleInput), targetTexts); - }); - }); - describe('evaluateTransition', () => { const testEdgeWindowSpec = { minTokens: 3, diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts index 50958ea756c..0b2a58da47b 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts @@ -357,7 +357,7 @@ describe('determineContextTransition', () => { const appendDeletingTransition = determineContextTransition( tracker, - extendingTransition.final, { + extensionDeletingTransition.final, { left: 'this is for testing ', startOfBuffer: true, endOfBuffer: true, @@ -431,7 +431,7 @@ describe('determineContextTransition', () => { const appendDeletingTransition = determineContextTransition( tracker, - extendingTransition.final, { + extensionDeletingTransition.final, { left: 'this is for testing ', startOfBuffer: true, endOfBuffer: true, @@ -443,7 +443,7 @@ describe('determineContextTransition', () => { const editingBkspTransition = determineContextTransition( tracker, - extendingTransition.final, { + appendDeletingTransition.final, { left: 'this is for testing', startOfBuffer: true, endOfBuffer: true,