diff --git a/web/src/engine/common/web-utils/src/priority-queue.ts b/web/src/engine/common/web-utils/src/priority-queue.ts index 66867f37ad5..bbbb627291f 100644 --- a/web/src/engine/common/web-utils/src/priority-queue.ts +++ b/web/src/engine/common/web-utils/src/priority-queue.ts @@ -29,8 +29,8 @@ export default class PriorityQueue { * the first parameter should precede the second parameter. * @param initialEntries */ - constructor(comparator: QueueComparator, initialEntries?: Type[]); - constructor(arg1: QueueComparator | PriorityQueue, initialEntries?: Type[]) { + constructor(comparator: QueueComparator, initialEntries?: ReadonlyArray); + constructor(arg1: QueueComparator | PriorityQueue, initialEntries?: ReadonlyArray) { if(typeof arg1 != 'function') { this.comparator = arg1.comparator; // Shallow-copies are fine. @@ -151,7 +151,7 @@ export default class PriorityQueue { * - O(`elements.count` * log(`heap.count`)) - logarithmic when elements.count << heap.count * @param elements A group of elements to enqueue simultaneously. */ - enqueueAll(elements: Type[]) { + enqueueAll(elements: ReadonlyArray) { if(elements.length == 0) { return; } @@ -227,7 +227,7 @@ export default class PriorityQueue { * This function makes no guarantees on the ordering of the returned elements; * they will almost certainly be unsorted. */ - toArray(): Type[] { - return this.heap.slice(0); + toArray(): ReadonlyArray { + return this.heap; } } \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts index 575bd697530..0b1798298e6 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts @@ -268,7 +268,7 @@ export class ContextState { const state = new ContextState(applyTransform(trueInput, context), lexicalModel); state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke); state.appliedInput = transformDistribution?.[0].sample; - transition.finalize(state, transformDistribution, resultTokenization.taillessTrueKeystroke); + transition.finalize(state, transformDistribution); transition.revertableTransitionId = appliedSuggestionTransitionId; return transition; } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts index aaa133f7640..b414b894442 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts @@ -7,11 +7,14 @@ * in the context and associated correction-search progress and results. */ -import { applyTransform, buildMergedTransform } from "@keymanapp/models-templates"; +import { buildMergedTransform } from "@keymanapp/models-templates"; +// import { applyTransform, buildMergedTransform } from "@keymanapp/models-templates"; import { LexicalModelTypes } from '@keymanapp/common-types'; import { deepCopy, KMWString } from "@keymanapp/web-utils"; +import { SearchCluster } from "./search-cluster.js"; import { SearchPath } from "./search-path.js"; +import { SearchSpace } from './search-space.js'; import { TokenSplitMap } from "./context-tokenization.js"; import Distribution = LexicalModelTypes.Distribution; @@ -58,10 +61,10 @@ export class ContextToken { * Contains all relevant correction-search data for use in generating * corrections for this ContextToken instance. */ - public get searchPath(): SearchPath { - return this._searchPath; + public get searchSpace(): SearchSpace { + return this._searchSpace; } - private _searchPath: SearchPath; + private _searchSpace: SearchSpace; isPartial: boolean; @@ -100,21 +103,14 @@ export class ContextToken { constructor(param: ContextToken | LexicalModel, rawText?: string, isPartial?: boolean) { if(param instanceof ContextToken) { const priorToken = param; - this.isWhitespace = priorToken.isWhitespace; - this.isPartial = priorToken.isPartial; + Object.assign(this, priorToken); // We need to construct a separate search space from other token copies. // // In case we are unable to perfectly track context (say, due to multitaps) // we need to ensure that only fully-utilized keystrokes are considered. - this._searchPath = priorToken.searchPath; + this._searchSpace = priorToken._searchSpace; this._inputRange = priorToken._inputRange.slice(); - - // Preserve any annotated applied-suggestion transition ID data; it's useful - // for delayed reversion operations. - if(priorToken.appliedTransitionId !== undefined) { - this.appliedTransitionId = priorToken.appliedTransitionId - } } else { const model = param; @@ -139,11 +135,11 @@ export class ContextToken { bestProbFromSet: 1 }); const priorSpace = searchSpace; - searchSpace = searchSpace.addInput(entry, 1); + searchSpace = new SearchPath(searchSpace, entry, 1); priorSpace.stopTrackingResults(); }); - this._searchPath = searchSpace; + this._searchSpace = searchSpace; } } @@ -153,8 +149,8 @@ export class ContextToken { */ addInput(inputSource: TokenInputSource, distribution: Distribution) { this._inputRange.push(inputSource); - const priorSpace = this._searchPath; - this._searchPath = this._searchPath.addInput(distribution, inputSource.bestProbFromSet); + const priorSpace = this._searchSpace; + this._searchSpace = new SearchCluster([new SearchPath(this._searchSpace, distribution, inputSource.bestProbFromSet)]); priorSpace.stopTrackingResults(); } @@ -173,6 +169,14 @@ export class ContextToken { return this.exampleInput == ''; } + /** + * Gets the unique identifier that may be used to match this ContextToken with + * a correction-search result. + */ + get spaceId(): number { + return this.searchSpace.spaceId; + } + /** * Gets a compact string-based representation of `inputRange` that * maps compatible token source ranges to each other. @@ -208,18 +212,7 @@ export class ContextToken { * received that can correspond to the current instance. */ get exampleInput(): string { - /* - * TODO: with clear limits (strict cost minimization?) / prior calculation - * attempts, return the best _suggestion_ for this token. This is - * especially relevant for epic/dict-breaker - we want to best model the token - * as it would apply within the word-breaking algorithm. - * - * If not possible, find the best of the deepest search paths and append the - * most likely keystroke data afterward. - */ - const transforms = this.searchPath.inputSequence.map((dist) => dist[0].sample) - const composite = transforms.reduce((accum, current) => buildMergedTransform(accum, current), {insert: '', deleteLeft: 0}); - return composite.insert; + return this.searchSpace.bestExample.text; } /** @@ -232,57 +225,58 @@ export class ContextToken { static merge(tokensToMerge: ContextToken[], lexicalModel: LexicalModel): ContextToken { // Assumption: if we're merging a token, it's not whitespace. // Thus, we don't set the .isWhitespace flag field. - const resultToken = new ContextToken(lexicalModel); - - let lastSourceInput: TokenInputSource; - let lastInputDistrib: Distribution; - for(const token of tokensToMerge) { - const inputCount = token.inputRange.length; - let startIndex = 0; - - if(token.inputRange.length == 0) { - continue; - } - - // Are we re-merging on a previously split transform? - if(lastSourceInput?.trueTransform != token.inputRange[0].trueTransform) { - if(lastSourceInput) { - resultToken.addInput(lastSourceInput, lastInputDistrib); - } // else: there's nothing to add as input - } else { - // If so, re-merge it! - startIndex++; - - lastInputDistrib = lastInputDistrib?.map((entry, index) => { - return { - sample: buildMergedTransform(entry.sample, token.searchPath.inputSequence[0][index].sample), - p: entry.p - } - }); - - // In case there's only one input that needs merging on both ends. - if(inputCount == 1) { - // There's potential that the next incoming token needs to merge with this. - continue; - } else { - resultToken.addInput(lastSourceInput, lastInputDistrib); - } - } - lastSourceInput = null; - lastInputDistrib = null; - - // Ignore the last entry for now - it may need to merge with a matching - // entry in the next token! - for(let i = startIndex; i < inputCount - 1; i++) { - resultToken.addInput(token.inputRange[i], token.searchPath.inputSequence[i]); - } - lastSourceInput = token.inputRange[inputCount-1]; - lastInputDistrib = token.searchPath.inputSequence[inputCount-1]; - } - - resultToken.addInput(lastSourceInput, lastInputDistrib); - - return resultToken; + throw new Error("Temporarily unimplemented"); + // const resultToken = new ContextToken(lexicalModel); + + // let lastSourceInput: TokenInputSource; + // let lastInputDistrib: Distribution; + // for(const token of tokensToMerge) { + // const inputCount = token.inputRange.length; + // let startIndex = 0; + + // if(token.inputRange.length == 0) { + // continue; + // } + + // // Are we re-merging on a previously split transform? + // if(lastSourceInput?.trueTransform != token.inputRange[0].trueTransform) { + // if(lastSourceInput) { + // resultToken.addInput(lastSourceInput, lastInputDistrib); + // } // else: there's nothing to add as input + // } else { + // // If so, re-merge it! + // startIndex++; + + // lastInputDistrib = lastInputDistrib?.map((entry, index) => { + // return { + // sample: buildMergedTransform(entry.sample, token.searchSpace.inputSequence[0][index].sample), + // p: entry.p + // } + // }); + + // // In case there's only one input that needs merging on both ends. + // if(inputCount == 1) { + // // There's potential that the next incoming token needs to merge with this. + // continue; + // } else { + // resultToken.addInput(lastSourceInput, lastInputDistrib); + // } + // } + // lastSourceInput = null; + // lastInputDistrib = null; + + // // Ignore the last entry for now - it may need to merge with a matching + // // entry in the next token! + // for(let i = startIndex; i < inputCount - 1; i++) { + // resultToken.addInput(token.inputRange[i], token.searchSpace.inputSequence[i]); + // } + // lastSourceInput = token.inputRange[inputCount-1]; + // lastInputDistrib = token.searchSpace.inputSequence[inputCount-1]; + // } + + // resultToken.addInput(lastSourceInput, lastInputDistrib); + + // return resultToken; } /** @@ -291,101 +285,107 @@ export class ContextToken { * @param lexicalModel * @returns */ - split(split: TokenSplitMap, lexicalModel: LexicalModel) { + split(split: TokenSplitMap, lexicalModel: LexicalModel): ContextToken[] { // Assumption: if we're splitting a token, it's not whitespace - and // neither are the spun-off tokens. Thus, we don't set the .isWhitespace // flag field. - const tokensFromSplit: ContextToken[] = []; - - // Build an alternate version of the transforms: if we preprocess all deleteLefts, - // what text remains from each? - const alteredSources = preprocessInputSources(this.inputRange); - - const blankContext = { left: '', startOfBuffer: true, endOfBuffer: true }; - const splitSpecs = split.matches.slice(); - let currentText = {...blankContext}; - let lenBeforeLastApply = 0; - let committedLen = 0; - let constructingToken = new ContextToken(lexicalModel); - let backupToken: ContextToken; - let transformIndex = 0; - while(splitSpecs.length > 0) { - const splitMatch = splitSpecs[0]; - - if(splitMatch.text == currentText.left) { - tokensFromSplit.push(constructingToken); - constructingToken = new ContextToken(lexicalModel); - backupToken = null; - committedLen += lenBeforeLastApply; - currentText = {...blankContext}; - splitSpecs.shift(); - continue; - } else if(currentText.left.indexOf(splitMatch.text) > -1) { - // Oh dear - we've overshot the target! The split is awkward, in the - // middle of a keystroke. - - // Restore! - const overextendedToken = constructingToken; - constructingToken = backupToken; - - // We know how much of the next transform to pull in: it's specified on - // the split object. Excess on constructed token - the split 'text offset' - const totalLenBeforeLastApply = committedLen + lenBeforeLastApply; - // We read the start position for the NEXT token to know the split position. - const extraCharsAdded = splitSpecs[1].textOffset - totalLenBeforeLastApply; - const tokenSequence = overextendedToken.searchPath.inputSequence; - const lastInputIndex = tokenSequence.length - 1; - const inputDistribution = tokenSequence[lastInputIndex]; - const headDistribution = inputDistribution.map((m) => { - return { - sample: { - ...m.sample, - insert: KMWString.substring(m.sample.insert, 0, extraCharsAdded), - deleteRight: 0 - }, p: m.p - }; - }); - const tailDistribution = inputDistribution.map((m) => { - return { - sample: { - ...m.sample, - insert: KMWString.substring(m.sample.insert, extraCharsAdded), - deleteLeft: 0 - }, p: m.p - }; - }); - - const priorSourceInput = overextendedToken.inputRange[lastInputIndex]; - constructingToken.addInput(priorSourceInput, headDistribution); - tokensFromSplit.push(constructingToken); - - constructingToken = new ContextToken(lexicalModel); - backupToken = new ContextToken(constructingToken); - constructingToken.addInput({ - trueTransform: priorSourceInput.trueTransform, - inputStartIndex: priorSourceInput.inputStartIndex + extraCharsAdded, - bestProbFromSet: priorSourceInput.bestProbFromSet - }, tailDistribution); - - const lenToCommit = lenBeforeLastApply + extraCharsAdded; - splitSpecs.shift(); - - committedLen += lenToCommit; - currentText.left = KMWString.substring(currentText.left, lenToCommit); - lenBeforeLastApply = 0; - continue; // without incrementing transformIndex - we haven't processed a new one! - } else if(transformIndex == alteredSources.length) { - throw new Error("Invalid split specified!"); - } - - backupToken = new ContextToken(constructingToken); - lenBeforeLastApply = KMWString.length(currentText.left); - currentText = applyTransform(alteredSources[transformIndex].trueTransform, currentText); - constructingToken.addInput(this.inputRange[transformIndex], this.searchPath.inputSequence[transformIndex]); - transformIndex++; - } - - return tokensFromSplit; + // + // Proper splitting with multi-tokenization: may yield multiple variants of + // the requested token count, all of which could be seen as valid. + // + // Depends on how the SearchSpace splits. + throw new Error("Temporarily unimplemented"); + // const tokensFromSplit: ContextToken[] = []; + + // // Build an alternate version of the transforms: if we preprocess all deleteLefts, + // // what text remains from each? + // const alteredSources = preprocessInputSources(this.inputRange); + + // const blankContext = { left: '', startOfBuffer: true, endOfBuffer: true }; + // const splitSpecs = split.matches.slice(); + // let currentText = {...blankContext}; + // let lenBeforeLastApply = 0; + // let committedLen = 0; + // let constructingToken = new ContextToken(lexicalModel); + // let backupToken: ContextToken; + // let transformIndex = 0; + // while(splitSpecs.length > 0) { + // const splitMatch = splitSpecs[0]; + + // if(splitMatch.text == currentText.left) { + // tokensFromSplit.push(constructingToken); + // constructingToken = new ContextToken(lexicalModel); + // backupToken = null; + // committedLen += lenBeforeLastApply; + // currentText = {...blankContext}; + // splitSpecs.shift(); + // continue; + // } else if(currentText.left.indexOf(splitMatch.text) > -1) { + // // Oh dear - we've overshot the target! The split is awkward, in the + // // middle of a keystroke. + + // // Restore! + // const overextendedToken = constructingToken; + // constructingToken = backupToken; + + // // We know how much of the next transform to pull in: it's specified on + // // the split object. Excess on constructed token - the split 'text offset' + // const totalLenBeforeLastApply = committedLen + lenBeforeLastApply; + // // We read the start position for the NEXT token to know the split position. + // const extraCharsAdded = splitSpecs[1].textOffset - totalLenBeforeLastApply; + // const tokenSequence = overextendedToken.searchSpace.inputSequence; + // const lastInputIndex = tokenSequence.length - 1; + // const inputDistribution = tokenSequence[lastInputIndex]; + // const headDistribution = inputDistribution.map((m) => { + // return { + // sample: { + // ...m.sample, + // insert: KMWString.substring(m.sample.insert, 0, extraCharsAdded), + // deleteRight: 0 + // }, p: m.p + // }; + // }); + // const tailDistribution = inputDistribution.map((m) => { + // return { + // sample: { + // ...m.sample, + // insert: KMWString.substring(m.sample.insert, extraCharsAdded), + // deleteLeft: 0 + // }, p: m.p + // }; + // }); + + // const priorSourceInput = overextendedToken.inputRange[lastInputIndex]; + // constructingToken.addInput(priorSourceInput, headDistribution); + // tokensFromSplit.push(constructingToken); + + // constructingToken = new ContextToken(lexicalModel); + // backupToken = new ContextToken(constructingToken); + // constructingToken.addInput({ + // trueTransform: priorSourceInput.trueTransform, + // inputStartIndex: priorSourceInput.inputStartIndex + extraCharsAdded, + // bestProbFromSet: priorSourceInput.bestProbFromSet + // }, tailDistribution); + + // const lenToCommit = lenBeforeLastApply + extraCharsAdded; + // splitSpecs.shift(); + + // committedLen += lenToCommit; + // currentText.left = KMWString.substring(currentText.left, lenToCommit); + // lenBeforeLastApply = 0; + // continue; // without incrementing transformIndex - we haven't processed a new one! + // } else if(transformIndex == alteredSources.length) { + // throw new Error("Invalid split specified!"); + // } + + // backupToken = new ContextToken(constructingToken); + // lenBeforeLastApply = KMWString.length(currentText.left); + // currentText = applyTransform(alteredSources[transformIndex].trueTransform, currentText); + // constructingToken.addInput(this.inputRange[transformIndex], this.searchSpace.inputSequence[transformIndex]); + // transformIndex++; + // } + + // return tokensFromSplit; } } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index e1055964a7f..b7de4e47860 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -118,6 +118,8 @@ export class ContextTokenization { * If the final token is new due to a newly-introduced wordboundary traversed * by the keystroke, this will generally be set to an empty transform that * 'finalizes' the previous tail token. + * + * (Refer to #12494 for an example case.) */ readonly taillessTrueKeystroke: Transform; @@ -149,6 +151,13 @@ export class ContextTokenization { return this.tokens[this.tokens.length - 1]; } + /** + * Returns the unique correction-search space identifier corresponding to this tokenization. + */ + get spaceId(): number { + return this.tail.spaceId; + } + /** * Returns plain-text strings representing the most probable representation for all * tokens represented by this tokenization instance. @@ -535,6 +544,9 @@ export class ContextTokenization { if(splits[0]?.input.index == i) { // do a split! const split = splits.shift(); + // Proper splitting with multi-tokenization: may yield multiple + // variants of the requested token count, all of which could be seen as + // valid. const splitResults = baseTokenization[i].split(split, lexicalModel); const resultStack = splitResults.reverse(); while(resultStack.length > 0) { diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts index daf99a37197..3d564a33706 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-transition.ts @@ -47,21 +47,6 @@ export class ContextTransition { // The transform ID in play. private _transitionId?: number; - /** - * Indicates the portion of the incoming keystroke data, if any, that applies to - * tokens before the last pre-caret token and thus should not be replaced by predictions - * based upon `state`. If the provided context state + the incoming transform do not - * adequately match the current context, the match attempt will fail with a `null` result. - * - * Should generally be non-null if the token before the caret did not previously exist. - * - * The result may be null if it does not match the prior context state or if bookkeeping - * based upon it is problematic - say, if wordbreaking effects shift due to new input, - * causing a mismatch with the prior state's tokenization. - * (Refer to #12494 for an example case.) - */ - preservationTransform?: Transform; - /** * When set, indicates that the text insertion point has returned to the endpoint of a * token last edited by application of a Suggestion. This is not set immediately after @@ -133,13 +118,12 @@ export class ContextTransition { * @param preservationTransform Portions of the most likely input that do not contribute to the final token * in the final context's tokenization. */ - finalize(state: ContextState, inputDistribution: Distribution, preservationTransform?: Transform) { + finalize(state: ContextState, inputDistribution: Distribution) { this._final = state; this.inputDistribution = inputDistribution; // Long-term, this should never be null... but we need to allow it at this point // in the refactoring process. this._transitionId = inputDistribution?.find((entry) => entry.sample.id !== undefined)?.sample.id; - this.preservationTransform = preservationTransform; } /** diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts index bdd0dba79ba..ed95dac6509 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts @@ -5,7 +5,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types'; import { ClassicalDistanceCalculation } from './classical-calculation.js'; import { ExecutionTimer, STANDARD_TIME_BETWEEN_DEFERS } from './execution-timer.js'; -import { PathResult, QUEUE_NODE_COMPARATOR, SearchPath } from './search-path.js'; +import { SearchSpace } from './search-space.js'; import { subsetByChar, subsetByInterval, mergeSubset, TransformSubset } from '../transform-subsets.js'; import Distribution = LexicalModelTypes.Distribution; @@ -21,6 +21,8 @@ type RealizedInput = ProbabilityMass[]; // NOT Distribution - they'r export const EDIT_DISTANCE_COST_SCALE = 5; export const MIN_KEYSTROKE_PROBABILITY = 0.0001; +export const DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL = 33; // in milliseconds. + export type TraversableToken = { key: TUnit, traversal: LexiconTraversal @@ -126,13 +128,20 @@ export class SearchNode { */ private _inputCost?: number; - constructor(rootTraversal: LexiconTraversal, toKey?: (arg0: string) => string); - constructor(node: SearchNode); - constructor(rootTraversal: LexiconTraversal | SearchNode, toKey?: (arg0: string) => string) { - toKey = toKey || (x => x); + /** + * A unique identifier corresponding to the earliest SearchPath containing + * the correction-search graph edge represented by this instance. + * + * Corresponding search results will be tagged with this, which can be used + * to identify the result's original source tokenization. + */ + readonly spaceId: number; - if(rootTraversal instanceof SearchNode) { - const priorNode = rootTraversal; + constructor(rootTraversal: LexiconTraversal, spaceId: number, toKey?: (arg0: string) => string); + constructor(node: SearchNode, spaceId?: number); + constructor(param1: LexiconTraversal | SearchNode, spaceId?: number, toKey?: ((arg0: string) => string)) { + if(param1 instanceof SearchNode) { + const priorNode = param1; Object.assign(this, priorNode); if(this.partialEdge) { @@ -144,11 +153,16 @@ export class SearchNode { // Do NOT copy over _inputCost; this is a helper-constructor for methods // building new nodes... which will have a different cost. delete this._inputCost; + + // This is unique at each level, though it will reuse a previous ID if no new + // one is provided (say, for 'insert' edits). + this.spaceId = spaceId ?? priorNode.spaceId; } else { this.calculation = new ClassicalDistanceCalculation(); - this.matchedTraversals = [rootTraversal]; + this.matchedTraversals = [param1]; this.priorInput = []; - this.toKey = toKey; + this.toKey = toKey || (x => x); + this.spaceId = spaceId; } } @@ -422,17 +436,17 @@ export class SearchNode { * represented lexicon prefix - be it due to not adding one (deletions) or * due to not being the same character, all mismatching cases are merged into * one, reducing the rate of expansion for the search graph. - * @param inputDistribution + * @param input * @param isSubstitution * @returns */ - private setupSubsetProcessing(inputDistribution: Distribution, isSubstitution: boolean ) { + private setupSubsetProcessing(dist: Distribution, isSubstitution: boolean, edgeId: number) { if(this.hasPartialInput) { throw new Error("Invalid state: will not take new input while still processing Transform subset"); } const edges: SearchNode[] = []; - const subsets = subsetByInterval(inputDistribution); + const subsets = subsetByInterval(dist); for(let dl = 0; dl < subsets.length; dl++) { const dlSubset = subsets[dl]; @@ -450,7 +464,7 @@ export class SearchNode { continue; } - const node = new SearchNode(this); + const node = new SearchNode(this, edgeId); node.calculation = edgeCalc; node.partialEdge = { doSubsetMatching: isSubstitution, @@ -479,8 +493,8 @@ export class SearchNode { * @returns An array of SearchNodes corresponding to search paths that skip the next * input keystroke. */ - buildDeletionEdges(inputDistribution: Distribution): SearchNode[] { - return this.setupSubsetProcessing(inputDistribution, false); + buildDeletionEdges(dist: Distribution, edgeId: number): SearchNode[] { + return this.setupSubsetProcessing(dist, false, edgeId); } /** @@ -492,12 +506,12 @@ export class SearchNode { * @returns An array of SearchNodes corresponding to search paths that match or * replace the next currently-unprocessed input. */ - buildSubstitutionEdges(inputDistribution: Distribution): SearchNode[] { + buildSubstitutionEdges(dist: Distribution, edgeId: number): SearchNode[] { // Note: due to the batching approach used via TransformSubsets, // substitutions are _not_ adequately represented by one 'insertion' + one // 'deletion' step. Explicit substitution / match-oriented processing is // required. - return this.setupSubsetProcessing(inputDistribution, true); + return this.setupSubsetProcessing(dist, true, edgeId); } /** @@ -547,22 +561,25 @@ export class SearchNode { } export class SearchResult { - private resultNode: SearchNode; + readonly node: SearchNode; + // Supports SearchPath -> SearchSpace remapping. + readonly spaceId: number; - constructor(node: SearchNode) { - this.resultNode = node; + constructor(node: SearchNode, spaceId?: number) { + this.node = node; + this.spaceId = spaceId ?? node.spaceId; } get inputSequence(): ProbabilityMass[] { - return this.resultNode.priorInput; + return this.node.priorInput; } get matchSequence(): TraversableToken[] { - return this.resultNode.calculation.matchSequence.map((char, i) => ({key: char, traversal: this.resultNode.matchedTraversals[i+1]})); + return this.node.calculation.matchSequence.map((char, i) => ({key: char, traversal: this.node.matchedTraversals[i+1]})); }; get matchString(): string { - return this.resultNode.resultKey; + return this.node.resultKey; } /** @@ -573,7 +590,7 @@ export class SearchResult { * `totalCost`.) */ get knownCost(): number { - return this.resultNode.editCount; + return this.node.editCount; } /** @@ -581,7 +598,7 @@ export class SearchResult { * negative log-likelihood of the input path taken to reach the node. */ get inputSamplingCost(): number { - return this.resultNode.inputSamplingCost; + return this.node.inputSamplingCost; } /** @@ -591,59 +608,49 @@ export class SearchResult { * to the resulting output. */ get totalCost(): number { - return this.resultNode.currentCost; + return this.node.currentCost; } get finalTraversal(): LexiconTraversal { - return this.resultNode.currentTraversal; + return this.node.currentTraversal; } } -// The set of search spaces corresponding to the same 'context' for search. -// Whenever a wordbreak boundary is crossed, a new instance should be made. -// Current best guesstimate of how compositor will retrieve ideal corrections. -export async function *getBestMatches(searchSpace: SearchPath, timer: ExecutionTimer): AsyncGenerator { - let currentReturns: {[resultKey: string]: SearchNode} = {}; - - // Stage 1 - if we already have extracted results, build a queue just for them and iterate over it first. - const returnedValues = Object.values(searchSpace.returnedValues); - if(returnedValues.length > 0) { - let preprocessedQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, returnedValues); - - while(preprocessedQueue.count > 0) { - const entryFromCache = timer.time(() => { - let entry = preprocessedQueue.dequeue(); - - // Is the entry a reasonable result? - if(entry.isFullReplacement) { - // If the entry's 'match' fully replaces the input string, we consider it - // unreasonable and ignore it. - return null; - } +/** + * Searches for the best available corrections from among the provided + * SearchSpaces, ending after the configured timer has elapsed or all available + * corrections have been enumerated. + * @param searchSpaces + * @param timer + * @returns + */ +export async function *getBestMatches(searchSpaces: SearchSpace[], timer: ExecutionTimer): AsyncGenerator { + const spaceQueue = new PriorityQueue((a, b) => a.currentCost - b.currentCost); - currentReturns[entry.resultKey] = entry; - // Do not track yielded time. - return new SearchResult(entry); - }, TimedTaskTypes.CACHED_RESULT); + // Stage 1 - if we already have extracted results, build a queue just for them + // and iterate over it first. + // + // Does not get any results that another iterator pulls up after this is + // created - and those results won't come up later in stage 2, either. Only + // intended for restarting a search, not searching twice in parallel. + const priorResultsQueue = new PriorityQueue((a, b) => a.totalCost - b.totalCost); + priorResultsQueue.enqueueAll(searchSpaces.map((space) => space.previousResults).flat()); - if(entryFromCache) { - // Time yielded here is generally spent on turning corrections into predictions. - // It's timing a different sort of task, so... different task set ID. - const timeSpan = timer.start(TimedTaskTypes.PREDICTING); - yield entryFromCache; - timeSpan.end(); + // With potential prior results re-queued, NOW enqueue. (Not before - the heap may reheapify!) + spaceQueue.enqueueAll(searchSpaces); - if(timer.timeSinceLastDefer > STANDARD_TIME_BETWEEN_DEFERS) { - await timer.defer(); - } - } - } - } + let currentReturns: {[resultKey: string]: SearchNode} = {}; // Stage 2: the fun part; actually searching! do { const entry = timer.time(() => { - let newResult: PathResult = searchSpace.handleNextNode(); + if((priorResultsQueue.peek()?.totalCost ?? Number.POSITIVE_INFINITY) <= spaceQueue.peek().currentCost) { + return priorResultsQueue.dequeue(); + } + + let bestQueue = spaceQueue.dequeue(); + const newResult = bestQueue.handleNextNode(); + spaceQueue.enqueue(bestQueue); if(newResult.type == 'none') { return null; @@ -667,7 +674,7 @@ export async function *getBestMatches(searchSpace: SearchPath, timer: ExecutionT if((currentReturns[node.resultKey]?.currentCost ?? Number.MAX_VALUE) > node.currentCost) { currentReturns[node.resultKey] = node; // Do not track yielded time. - return new SearchResult(node); + return new SearchResult(node, newResult.spaceId); } } @@ -683,7 +690,7 @@ export async function *getBestMatches(searchSpace: SearchPath, timer: ExecutionT if(timer.timeSinceLastDefer > STANDARD_TIME_BETWEEN_DEFERS) { await timer.defer(); } - } while(!timer.elapsed && searchSpace.currentCost < Number.POSITIVE_INFINITY); + } while(!timer.elapsed && spaceQueue.peek().currentCost < Number.POSITIVE_INFINITY); return null; } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts new file mode 100644 index 00000000000..b490f01be79 --- /dev/null +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-cluster.ts @@ -0,0 +1,360 @@ +/* + * Keyman is copyright (C) SIL Global. MIT License. + * + * Created by jahorton on 2025-10-20 + * + * This file defines the predictive-text engine's SearchSpace class, which is used to + * manage the search-space(s) for text corrections within the engine. + */ + +import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils'; +import { LexicalModelTypes } from '@keymanapp/common-types'; + +import { SearchNode, SearchResult } from './distance-modeler.js'; +import { generateSpaceSeed, PathResult, SearchSpace } from './search-space.js'; +import { SearchPath } from './search-path.js'; + +import Distribution = LexicalModelTypes.Distribution; +import LexicalModel = LexicalModelTypes.LexicalModel; +import Transform = LexicalModelTypes.Transform; + +const PATH_QUEUE_COMPARATOR: Comparator = (a, b) => { + return a.currentCost - b.currentCost; +} + +// The set of search spaces corresponding to the same 'context' for search. +// Whenever a wordbreak boundary is crossed, a new instance should be made. +export class SearchCluster implements SearchSpace { + private selectionQueue: PriorityQueue = new PriorityQueue(PATH_QUEUE_COMPARATOR); + readonly spaceId: number; + + // We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time + // is cheaper than iteratively building a priority queue. + /** + * This tracks all paths that have reached the end of a viable input-matching path - even + * those of lower cost that produce the same correction as other paths. + * + * When new input is received, its entries are then used to append edges to the path in order + * to find potential paths to reach a new viable end. + */ + private completedPaths?: SearchNode[] = []; + + /** + * Acts as a Map that prevents duplicating a correction-search path if reached + * more than once. + */ + protected get processedEdgeSet(): {[pathKey: string]: boolean} { + return this._processedEdgeSet; + } + + private _processedEdgeSet?: {[pathKey: string]: boolean} = {}; + + /** + * Provides a heuristic for the base cost at each depth if the best + * individual input were taken at that level. + */ + readonly lowestPossibleSingleCost: number; + + /** + * Constructs a fresh SearchSpace instance for used in predictive-text correction + * and suggestion searches. + * @param baseSpaceId + * @param model + */ + constructor(model: LexicalModel); + constructor(inboundPaths: ReadonlyArray); + constructor(arg2?: ReadonlyArray | LexicalModel) { + this.spaceId = generateSpaceSeed(); + + if(Array.isArray(arg2)) { + const inboundPaths = arg2 as ReadonlyArray; + this.lowestPossibleSingleCost = Math.min(...inboundPaths.map(p => p.lowestPossibleSingleCost)); + this.completedPaths = inboundPaths.flatMap(p => p.previousResults).map(r => r.node); + inboundPaths.forEach(p => p.stopTrackingResults()); + this.selectionQueue.enqueueAll(inboundPaths); + } else { + const model = arg2 as LexicalModel; + const rootPath = new SearchPath(model); + this.selectionQueue.enqueue(rootPath); + } + + return; + } + + /** + * Retrieves the sequence of inputs + */ + public get inputSequences(): Distribution[][] { + const paths = this.selectionQueue.toArray(); + return paths.flatMap((p) => p.inputSequences); + } + + public get inputCount(): number { + return this.selectionQueue.peek()?.inputCount ?? 0; + } + + public hasInputs(keystrokeDistributions: Distribution[]): boolean { + return !!this.parents.find((p) => p.hasInputs(keystrokeDistributions)); + } + + public get bestExample(): {text: string, p: number} { + const bestPrefixes = this.selectionQueue.toArray().map(p => p.bestExample); + return bestPrefixes.reduce((max, curr) => max.p < curr.p ? curr : max); + } + + public get parents(): SearchSpace[] { + return this.selectionQueue.toArray().slice(); + } + + increaseMaxEditDistance() { + // By extracting the entries from the priority queue and increasing distance outside of it as a batch job, + // we get an O(N) implementation, rather than the O(N log N) that would result from maintaining the original queue. + const entries = this.selectionQueue.toArray(); + + entries.forEach((path) => path.increaseMaxEditDistance()); + + // Since we just modified the stored instances, and the costs may have shifted, we need to re-heapify. + this.selectionQueue = new PriorityQueue(PATH_QUEUE_COMPARATOR, entries.slice()); + } + + /** + * When true, this indicates that the currently-represented portion of context + * has fat-finger data available, which itself indicates that the user has + * corrections enabled. + */ + get correctionsEnabled(): boolean { + const paths = this.selectionQueue.toArray(); + // When corrections are disabled, the Web engine will only provide individual Transforms + // for an input, not a distribution. No distributions means we shouldn't do corrections. + return !!paths.find(p => p.correctionsEnabled); + } + + /** + * Extends the correction-search process embodied by this SearchSpace by an extra + * input character, according to the characters' likelihood in the distribution. + * @param inputDistribution The fat-finger distribution for the incoming keystroke (or + * just the raw keystroke if corrections are disabled) + */ + addInput(inputDistribution: Distribution, bestProbFromSet: number): SearchPath { + return new SearchPath(this, inputDistribution, bestProbFromSet); + } + + public get currentCost(): number { + return this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; + } + + public get rootPath() { + return this.selectionQueue.peek().rootPath; + } + + /** + * Retrieves the lowest-cost / lowest-distance edge from the selection queue, + * checks its validity as a correction to the input text, and reports on what + * sort of result the edge's destination node represents. + * @returns + */ + public handleNextNode(): PathResult { + const bestPath = this.selectionQueue.dequeue(); + const currentResult = bestPath.handleNextNode(); + this.selectionQueue.enqueue(bestPath); + + if(currentResult.type == 'complete') { + this.completedPaths?.push(currentResult.finalNode); + currentResult.spaceId = this.spaceId; + } + + return currentResult; + } + + public get previousResults(): SearchResult[] { + return this.completedPaths?.map((n => new SearchResult(n, this.spaceId))) ?? []; + } + + public stopTrackingResults() { + delete this.completedPaths; + } + + public get codepointLength(): number { + return this.parents?.[0].codepointLength ?? 0; + } + + // public merge(space: SearchSpace): SearchSpace { + // // just... iterate through entries to construct an extended version of THIS + // // search-space. + // // - though... we aren't actually set up to... DO that, are we? + // // - ... .inputs? That might actually work! + // // - issue: merging previously split inputs / input paths. How to identify those? + // // - ** wait: they share the same transform ID on both sides! ** + // // - alternative thought: could demark a source path spaceID on split-off paths? + // // - or... maybe just a 'split ID'? + // // - so... that's enough, right? For our purposes? + // // - but split + split again are not technically impossible... + // // - and even then, isn't that actually overcomplicating things? + // // + // // Needs a new spaceID, of course - at each appended step, to be clear. + // // + // // + // // spaceID => SearchSpace, it would seem. Just have the token use the ID + // // from SearchSpace / SearchEdge... wait. SearchEdges with same ID... that's + // // only really available from SearchSpace. May need to brainstorm that a bit. + // // - current design WAS to do the combination on the State level. + // // ... what if SearchSpace re-maps the search path stateIDs to a combined stateID + // // that it emits? Then we don't need to worry about micromanaging search path + // // IDs given how they'll be constructed. + // } + + public split(charIndex: number, model: LexicalModel): [SearchSpace, SearchSpace][] { + return this._split(charIndex, model, new Map()); + } + + // splitCache: + // - key: id of original search path being split + // - value's index: the number of preserved codepoints + // - value's instance at the index: the spun-off search space + private _split( + charIndex: number, + model: LexicalModel, + splitCache: Map + ): [SearchCluster, SearchCluster][] { + if(this.codepointLength == charIndex) { + console.log('a'); + // If we're splitting at the tail end of an existing space, just re-use + // the space and pass along an empty one for the end. + return [[this, new SearchCluster(model)]]; + } + + // Ensure common split-ancestors still resolve to the same entity. + const componentPaths = this.selectionQueue.toArray(); + let baseResultSet: [SearchCluster, SearchCluster][] = []; + + const deduplicateSplitResults = (results: [SearchCluster, SearchCluster][]) => { + // Re-merge paths that converge to the same point. + const duplicateMap: Map = new Map(); + results.forEach(result => { + const headSpaceId = result[0].spaceId; + const arr: [SearchCluster, SearchCluster][] = duplicateMap.get(headSpaceId) ?? []; + arr.push(result); + duplicateMap.set(result[0].spaceId, arr); + }); + + const finalResults: [SearchCluster, SearchCluster][] = []; + for(const splits of duplicateMap.values()) { + const headSpace = splits[0][0]; + + // const uniqueTailSpaces = [...splits.reduce((set, curr) => { + // if(!set.has(curr[1].spaceId)) { + // set.set(curr[1].spaceId, curr[1]); + // } else { + // console.log('z'); + // } + + // return set; + // }, new Map()).values()]; + + const paths = splits.flatMap(split => split[1].selectionQueue.toArray()); + const tailSpace = new SearchCluster(paths); + // const resultPaths: [SearchSpace, SearchSpace][] = uniqueTailSpaces.map((tailSpace) => ([headSpace, tailSpace])); + + // resultPaths.forEach(entry => finalResults.push(entry)); + finalResults.push([headSpace, tailSpace]); + } + + return finalResults; + } + + const pathFiltering = componentPaths.reduce((filtering, path) => { + if(path.codepointLength - path.edgeLength > charIndex) { + filtering.inParent.push(path); + } else { + filtering.inCurrent.push(path); + } + + return filtering; + }, { inParent: [] as SearchPath[], inCurrent: [] as SearchPath[]}) + + // should filter all that meet the condition (and those that don't) + if(pathFiltering.inParent.length > 0) { + const parentResults = pathFiltering.inParent.flatMap((path) => { + console.log(`b - ${path.bestExample.text}`); + // TODO: resolve! + const results = (path.parents[0] as SearchCluster)._split(charIndex, model, splitCache); + + return results.map((results) => { + const tailSpace = new SearchCluster([results[1].addInput([...path.inputs], path.bestProbInEdge)]); + results[1] = tailSpace; + return results; + }); + }); + + baseResultSet = parentResults; + } + + // Re: space IDs - we can't reuse data for anything we're reconstructing + // after the split point. Original space IDs on the left-hand side may + // remain unaltered, but right-hand needs to be re-built from scratch, in + // new SearchPaths / SearchSpaces. + // + // We can optimize how many new spaces/paths we create for the right-hand + // side, though: each starting the same count in, at the same input-offset + // position, should be safe to amalgamate. + const pathResults: [SearchCluster, SearchCluster][] = pathFiltering.inCurrent.map((path) => { + const parentSpace = path.parents[0] ?? new SearchCluster(model); + const pathStartIndex = path.codepointLength - path.edgeLength; + if(path.codepointLength - path.edgeLength == charIndex) { + console.log(`c - ${path.bestExample.text}`); + // yay, great case! Splits cleanly on the boundary BEFORE this path, at + // its start. + // + // parentSpace is thus the END of the prior token. + // Start a new one with the current Path. + // return [parentSpace, new SearchSpace(/* new spaceId */, path /* reconstructed, now space ID */)]; + const newPath = new SearchCluster(model).addInput([...path.inputs], path.bestProbInEdge); + return [ + parentSpace instanceof SearchPath ? new SearchCluster([parentSpace]) : parentSpace, + new SearchCluster([newPath]) + ] as [SearchCluster, SearchCluster]; + } else { + console.log(`d - ${path.bestExample.text}`); + // OK, so we need to actually split this path in twain. + const pathCharIndex = charIndex - pathStartIndex; + const results = path.split(pathCharIndex, model); + console.log(`pathId: ${path.spaceId} - ${splitCache.has(path.spaceId) ? 'found' : 'not found'}`); + + const pathSplitCacheArray = splitCache.get(path.spaceId) ?? []; + splitCache.set(path.spaceId, pathSplitCacheArray); + + const newHeadSpace = pathSplitCacheArray[pathCharIndex]?.head ?? new SearchCluster([new SearchPath(parentSpace, [...results[0].inputs], path.bestProbInEdge)]); + const newTailSpace = pathSplitCacheArray[pathCharIndex]?.tail ?? new SearchCluster([new SearchCluster(model).addInput([...results[1].inputs], path.bestProbInEdge)]); + + pathSplitCacheArray[pathCharIndex] = { + head: newHeadSpace, + tail: newTailSpace + } + return [newHeadSpace, newTailSpace]; + } + }); + + baseResultSet = pathResults.concat(baseResultSet); + + // From pathResults: + // - LHS deduplicate: if same spaceIDs appear on left-hand side, they're the same space; + // we likely split at the same pointt + // - RHS: check search depth + offset position + // - order by input set likelihood + // - replace other path variants with that + // + // Finally, deduplicate the tuples as much as possible. + // ... wait. Why do we have multiplicity in the paths? We need to be able to reduce things + // down to just 1 + 1 split token, not multiple in each position. + // + // ... first stop: we could just... take the most likely case and ignore the others? + // ... in which case, why evaluate ALL paths? + // - b/c LHS matches could show up multiple times? + // + // Can we mitigate these cases with improved output from the wordbreaker(s) - say, + // about "ambiguous wordbreak" scenarios? + + console.log(`result count: ${baseResultSet.length}; results ${JSON.stringify(baseResultSet.map(r => ([r[0].bestExample.text, r[1].bestExample.text])))}`); + return deduplicateSplitResults(baseResultSet); + } +} \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts index f7e89eb1cba..db654b15f15 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts @@ -8,82 +8,63 @@ * engine. */ -import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils'; +import { QueueComparator as Comparator, KMWString, PriorityQueue } from '@keymanapp/web-utils'; import { LexicalModelTypes } from '@keymanapp/common-types'; import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js'; +import { SearchCluster } from './search-cluster.js'; +import { generateSpaceSeed, PathResult, SearchSpace } from './search-space.js'; import Distribution = LexicalModelTypes.Distribution; import LexicalModel = LexicalModelTypes.LexicalModel; import Transform = LexicalModelTypes.Transform; -export const DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL = 33; // in milliseconds. - export const QUEUE_NODE_COMPARATOR: Comparator = function(arg1, arg2) { return arg1.currentCost - arg2.currentCost; } -type NullPath = { - type: 'none' -} - -type IntermediateSearchPath = { - type: 'intermediate', - cost: number -} - -type CompleteSearchPath = { - type: 'complete', - cost: number, - finalNode: SearchNode -} - -export type PathResult = NullPath | IntermediateSearchPath | CompleteSearchPath; - // The set of search spaces corresponding to the same 'context' for search. // Whenever a wordbreak boundary is crossed, a new instance should be made. -export class SearchPath { +export class SearchPath implements SearchSpace { private selectionQueue: PriorityQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR); - private inputs: Distribution; + private _inputs?: Distribution; + + public get inputs(): Distribution { + return this._inputs; + } readonly rootPath: SearchPath; - private parentPath: SearchPath; + readonly bestProbInEdge: number; + + private parentSpace: SearchSpace; + readonly spaceId: number; // We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time // is cheaper than iteratively building a priority queue. - /** - * This tracks all paths that have reached the end of a viable input-matching path - even - * those of lower cost that produce the same correction as other paths. - * - * When new input is received, its entries are then used to append edges to the path in order - * to find potential paths to reach a new viable end. - */ - private completedPaths?: SearchNode[] = []; /** - * Marks all results that have already been returned since the last input was received. - * Is cleared after .addInput() calls. + * Marks all results that have already been returned from this instance of SearchPath. + * Should be deleted and cleared if any paths consider this one as a parent. */ - public returnedValues?: {[resultKey: string]: SearchNode} = {}; // TODO: make it private again! + private returnedValues?: {[resultKey: string]: SearchNode} = {}; /** * Acts as a Map that prevents duplicating a correction-search path if reached * more than once. */ protected get processedEdgeSet(): {[pathKey: string]: boolean} { - return this._processedEdgeSet; + return this.rootPath._processedEdgeSet; } private _processedEdgeSet?: {[pathKey: string]: boolean} = {}; /** - * Provides a heuristic for the base cost at each depth if the best - * individual input were taken at that level. + * Provides a heuristic for the base cost at this path's depth if the best + * individual input were taken here, regardless of whether or not that's possible. */ - private lowestCostAtDepth: number[]; + readonly lowestPossibleSingleCost: number; - constructor(space: SearchPath); /** * Constructs a fresh SearchPath instance for used in predictive-text correction * and suggestion searches. @@ -91,44 +72,140 @@ export class SearchPath { * @param model */ constructor(model: LexicalModel); - constructor(arg1: LexicalModel | SearchPath) { - if(arg1 instanceof SearchPath) { - const parentSpace = arg1; - this.lowestCostAtDepth = parentSpace.lowestCostAtDepth.slice(); + constructor(space: SearchSpace, inputs: Distribution, bestProbFromSet: number); + constructor(arg1: LexicalModel | SearchSpace, inputs?: Distribution, bestProbFromSet?: number) { + // If we're taking in a pre-constructed search node, it's got an associated, + // pre-assigned spaceID - so use that. + const isExtending = (arg1 instanceof SearchPath || arg1 instanceof SearchCluster); + this.spaceId = generateSpaceSeed(); + + if(isExtending) { + const parentSpace = arg1 as SearchSpace; + this.bestProbInEdge = bestProbFromSet; + const logTierCost = -Math.log(bestProbFromSet); + + this._inputs = inputs; + this.lowestPossibleSingleCost = parentSpace.lowestPossibleSingleCost + logTierCost; this.rootPath = parentSpace.rootPath; - this.parentPath = parentSpace; + this.parentSpace = parentSpace; - return; - } + this.addEdgesForNodes(parentSpace.previousResults.map(r => r.node)); - const model = arg1; - if(!model.traverseFromRoot) { - throw new Error("The provided model does not implement the `traverseFromRoot` function, which is needed to support robust correction searching."); + return; } - const rootNode = new SearchNode(model.traverseFromRoot(), model.toKey ? model.toKey.bind(model) : null); - this.selectionQueue.enqueue(rootNode); - this.lowestCostAtDepth = []; + const model = arg1 as LexicalModel; + this.selectionQueue.enqueue(new SearchNode(model.traverseFromRoot(), this.spaceId, t => model.toKey(t))); + this.lowestPossibleSingleCost = 0; this.rootPath = this; - - this.completedPaths = []; + this.bestProbInEdge = 1; } /** - * Retrieves the sequence of inputs + * Retrieves the sequences of inputs that led to this SearchPath. */ - public get inputSequence(): Distribution[] { - if(this.parentPath) { - return [...this.parentPath.inputSequence, this.inputs]; - } else if(this.inputs) { - return [this.inputs]; + public get inputSequences(): Distribution[][] { + const parentSequences = this.parentSpace?.inputSequences ?? []; + + if(parentSequences.length == 0) { + return this._inputs ? [[this._inputs]] : []; + } else { + return parentSequences.map(s => [...s, this._inputs]); + } + } + + public hasInputs(keystrokeDistributions: Distribution[]): boolean { + if(this.inputCount == 0) { + return keystrokeDistributions.length == 0; + } else if(keystrokeDistributions.length != this.inputCount) { + return false; + } + + const tailInput = [...keystrokeDistributions.pop()]; + const localInput = this.lastInput; + + // Actual reference match? Easy mode. + if(localInput == tailInput) { + return !!this.parents.find(p => p.hasInputs(keystrokeDistributions)); + } else if(localInput.length != tailInput.length) { + return false; } else { - return []; + for(let entry of tailInput) { + const matchIndex = localInput.findIndex((x) => { + const s1 = x.sample; + const s2 = entry.sample; + // Check for equal reference first before the other checks; it makes a nice shortcut. + if(x == entry) { + return true; + } if(x.p == entry.p && s1.deleteLeft == s2.deleteLeft + && s1.id == s2.id && ((s1.deleteRight ?? 0) == (s2.deleteRight ?? 0)) && s1.insert == s2.insert + ) { + return true; + } + return false; + }); + + if(matchIndex == -1) { + return false; + } else { + tailInput.splice(matchIndex, 1); + } + } + + return !!this.parents.find(p => p.hasInputs(keystrokeDistributions)); + } + } + + public get lastInput(): Distribution> { + // Shallow-copies the array to prevent external modification; the Transforms + // are marked Readonly to prevent their modification as well. + return [...this.inputs]; + } + + public get inputCount(): number { + if(!this.parentSpace) { + return 0; + } else { + return this.parentSpace.inputCount + 1; + } + } + + public get logTierCost(): number { + return -Math.log(this.bestProbInEdge); + } + + // TODO: track as a class property; avoid the need for repeated string calculations. + // Or just use the subset and its pre-known length/delete values in some manner. + public get edgeLength(): number { + const insert = this._inputs?.[0].sample.insert ?? ''; + return KMWString.length(insert); + } + + // TODO: consider optimizing this; we could certainly precompute these values + // rather than recalculating it each time. + public get codepointLength(): number { + const deleteLeft = this._inputs?.[0].sample.deleteLeft ?? 0; + const baseLength = this.parentSpace?.codepointLength ?? 0; + return baseLength + this.edgeLength - deleteLeft; + } + + public get bestExample(): {text: string, p: number} { + const bestPrefix = this.parentSpace?.bestExample ?? { text: '', p: 1 }; + const bestLocalInput = this._inputs?.reduce((max, curr) => max.p < curr.p ? curr : max) ?? { sample: { insert: '', deleteLeft: 0 }, p: 1}; + + return { + text: KMWString.substring(bestPrefix.text, 0, KMWString.length(bestPrefix.text) - bestLocalInput.sample.deleteLeft) + bestLocalInput.sample.insert, + p: bestPrefix.p * bestLocalInput.p } } + get parents() { + // The SearchPath class may only have a single parent. + return [this.parentSpace]; + } + increaseMaxEditDistance() { - this.parentPath.increaseMaxEditDistance(); + this.parentSpace.increaseMaxEditDistance(); // By extracting the entries from the priority queue and increasing distance outside of it as a batch job, // we get an O(N) implementation, rather than the O(N log N) that would result from maintaining the original queue. @@ -140,85 +217,73 @@ export class SearchPath { this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, entries); } + // ... maaaaybe only call if actually splitting? + // charIndex: index within this.edgeLength where the split may occur. + public split(charIndex: number, model: LexicalModel): [SearchPath, SearchPath] { + // ... might be calculated from the SearchSpace class? + if(charIndex < this.edgeLength) { + // TODO: split! + const firstSet: Distribution = this._inputs.map((input) => ({ + // keep insert head + // keep deleteLeft + sample: { + insert: KMWString.substring(input.sample.insert, 0, charIndex), + deleteLeft: input.sample.deleteLeft + }, p: input.p + })); + + const secondSet: Distribution = this._inputs.map((input) => ({ + // keep insert tail + // deleteLeft == 0 + sample: { + insert: KMWString.substring(input.sample.insert, charIndex), + deleteLeft: 0 + }, p: input.p + })); + + // construct two SearchPath instances based on the two sets! + return [ + new SearchPath(this.parentSpace, firstSet, this.logTierCost), + new SearchPath(new SearchPath(model), secondSet, this.logTierCost) + ]; + } else { + // this instance = 'first set' + // second instance: empty transforms. + // + // stopgap: maybe go ahead and check each input for any that are longer? + // won't matter shortly, though. + return [this, new SearchPath(model)]; + } + } + get correctionsEnabled(): boolean { // When corrections are disabled, the Web engine will only provide individual Transforms // for an input, not a distribution. No distributions means we shouldn't do corrections. - return this.parentPath?.correctionsEnabled || this.inputs?.length > 1; + return this.parentSpace?.correctionsEnabled || this.inputs?.length > 1; } - /** - * Extends the correction-search process embodied by this SearchPath by an extra - * input character, according to the characters' likelihood in the distribution. - * @param inputDistribution The fat-finger distribution for the incoming keystroke (or - * just the raw keystroke if corrections are disabled) - */ - addInput(inputDistribution: Distribution, bestProbFromSet: number): SearchPath { - const input = inputDistribution; - - const childSpace = new SearchPath(this); + public get currentCost(): number { + const parentCost = this.parentSpace?.currentCost ?? Number.POSITIVE_INFINITY; + const localCost = this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; - childSpace.inputs = inputDistribution; - const lastDepthCost = this.lowestCostAtDepth[this.lowestCostAtDepth.length - 1] ?? 0; - const logTierCost = -Math.log(bestProbFromSet); - childSpace.lowestCostAtDepth.push(lastDepthCost + logTierCost); + return Math.min(localCost, parentCost); + } + protected addEdgesForNodes(baseNodes: ReadonlyArray) { // With a newly-available input, we can extend new input-dependent paths from // our previously-reached 'extractedResults' nodes. - let newlyAvailableEdges: SearchNode[] = []; - let batches = this.completedPaths?.map(function(node) { - let deletions = node.buildDeletionEdges(input); - let substitutions = node.buildSubstitutionEdges(input); + let outboundNodes = baseNodes.map((node) => { + let deletions = node.buildDeletionEdges(this.inputs, this.spaceId); + let substitutions = node.buildSubstitutionEdges(this.inputs, this.spaceId); const batch = deletions.concat(substitutions); // Skip the queue for the first pass; there will ALWAYS be at least one pass, // and queue-enqueing does come with a cost. Avoid the unnecessary overhead. return batch.flatMap(e => e.processSubsetEdge()); - }); - - childSpace.completedPaths = []; - childSpace.returnedValues = {}; - - batches?.forEach(function(batch) { - newlyAvailableEdges = newlyAvailableEdges.concat(batch); - }); - - childSpace.selectionQueue.enqueueAll(newlyAvailableEdges); - - return childSpace; - } + }).flat(); - public get currentCost(): number { - const parentCost = this.parentPath?.currentCost ?? Number.POSITIVE_INFINITY; - const localCost = this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; - - return Math.min(localCost, parentCost); - } - - /** - * Given an incoming SearchNode, this method will build all outgoing edges - * from the node that correspond to processing this SearchPath instance's - * input distribution. - * @param currentNode - */ - private addEdgesForNodes(currentNode: SearchNode) { - // Hard restriction: no further edits will be supported. This helps keep the search - // more narrowly focused. - const substitutionsOnly = currentNode.editCount == 2; - - let deletionEdges: SearchNode[] = []; - if(!substitutionsOnly) { - deletionEdges = currentNode.buildDeletionEdges(this.inputs); - } - const substitutionEdges = currentNode.buildSubstitutionEdges(this.inputs); - let batch = deletionEdges.concat(substitutionEdges); - - // Skip the queue for the first pass; there will ALWAYS be at least one pass, - // and queue-enqueing does come with a cost - avoid unnecessary overhead here. - batch = batch.flatMap(e => e.processSubsetEdge()); - - this.selectionQueue.enqueueAll(batch); - // We didn't reach an end-node, so we just end the iteration and continue the search. + this.selectionQueue.enqueueAll(outboundNodes); } /** @@ -228,7 +293,7 @@ export class SearchPath { * @returns */ public handleNextNode(): PathResult { - const parentCost = this.parentPath?.currentCost ?? Number.POSITIVE_INFINITY; + const parentCost = this.parentSpace?.currentCost ?? Number.POSITIVE_INFINITY; const localCost = this.selectionQueue.peek()?.currentCost ?? Number.POSITIVE_INFINITY; if(parentCost <= localCost) { @@ -238,21 +303,22 @@ export class SearchPath { }; } - const result = this.parentPath.handleNextNode() as IntermediateSearchPath | CompleteSearchPath; + const result = this.parentSpace.handleNextNode(); - if(result.type == 'complete' && !this.processedEdgeSet[result.finalNode.pathKey]) { - this.addEdgesForNodes(result.finalNode); + if(result.type == 'complete') { + this.addEdgesForNodes([result.finalNode]); } return { ...result, type: 'intermediate' - } + } as PathResult } + // will have equal .spaceId. let currentNode = this.selectionQueue.dequeue(); - let unmatchedResult: IntermediateSearchPath = { + let unmatchedResult = { type: 'intermediate', cost: currentNode.currentCost } @@ -260,7 +326,7 @@ export class SearchPath { // Have we already processed a matching edge? If so, skip it. // We already know the previous edge is of lower cost. if(this.processedEdgeSet[currentNode.pathKey]) { - return unmatchedResult; + return unmatchedResult as PathResult; } else { this.processedEdgeSet[currentNode.pathKey] = true; } @@ -271,7 +337,7 @@ export class SearchPath { // Note: .knownCost is not scaled, while its contribution to .currentCost _is_ scaled. let substitutionsOnly = false; if(currentNode.editCount > 2) { - return unmatchedResult; + return unmatchedResult as PathResult; } else if(currentNode.editCount == 2) { substitutionsOnly = true; } @@ -280,11 +346,8 @@ export class SearchPath { // Allows a little 'wiggle room' + 2 "hard" edits. // Can be important if needed characters don't actually exist on the keyboard // ... or even just not the then-current layer of the keyboard. - // - // TODO: still consider the lowest-cost individual edges for THIS specific criterion. - const tierMinCost = this.lowestCostAtDepth[currentNode.priorInput.length-1]; - if(currentNode.currentCost > tierMinCost + 2.5 * EDIT_DISTANCE_COST_SCALE) { - return unmatchedResult; + if(currentNode.currentCost > this.lowestPossibleSingleCost + 2.5 * EDIT_DISTANCE_COST_SCALE) { + return unmatchedResult as PathResult; } // Stage 2: process subset further OR build remaining edges @@ -292,7 +355,7 @@ export class SearchPath { if(currentNode.hasPartialInput) { // Re-use the current queue; the number of total inputs considered still holds. this.selectionQueue.enqueueAll(currentNode.processSubsetEdge()); - return unmatchedResult; + return unmatchedResult as PathResult; } // OK, we fully crossed a graph edge and have landed on a transition point; @@ -304,28 +367,36 @@ export class SearchPath { this.selectionQueue.enqueueAll(insertionEdges); } - // It was the final tier - store the node for future reference. - this.completedPaths?.push(currentNode); + if(currentNode.spaceId == this.spaceId) { + if(this.returnedValues) { + if((this.returnedValues[currentNode.resultKey]?.currentCost ?? Number.POSITIVE_INFINITY) > currentNode.currentCost) { + this.returnedValues[currentNode.resultKey] = currentNode; + } else { + // Not a better cost, so reject it and move on to the next potential result. + return this.handleNextNode(); + } + } - if((this.returnedValues[currentNode.resultKey]?.currentCost ?? Number.POSITIVE_INFINITY) > currentNode.currentCost) { - this.returnedValues[currentNode.resultKey] = currentNode; - } else { - // Not a better cost, so reject it and move on to the next potential result. - return this.handleNextNode(); + return { + type: 'complete', + cost: currentNode.currentCost, + finalNode: currentNode, + spaceId: this.spaceId + }; } - return { - type: 'complete', - cost: currentNode.currentCost, - finalNode: currentNode - }; + // If we've somehow fully exhausted all search options, indicate that none remain. + return unmatchedResult as PathResult; } - public previousResults(): SearchResult[] { - return Object.values(this.returnedValues).map(v => new SearchResult(v)); + public get previousResults(): SearchResult[] { + return Object.values(this.returnedValues ?? {}).map(v => new SearchResult(v)); } + /** + * + */ public stopTrackingResults() { - delete this.completedPaths; + delete this.returnedValues; } } \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts new file mode 100644 index 00000000000..69c5e7143cc --- /dev/null +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts @@ -0,0 +1,154 @@ +/* + * Keyman is copyright (C) SIL Global. MIT License. + * + * Created by jahorton on 2025-10-09 + * + * This file the predictive-text engine's SearchSpace class, which is used to + * manage the search-space(s) for text corrections within the engine. + */ + +import { LexicalModelTypes } from "@keymanapp/common-types"; + +import { SearchNode, SearchResult } from "./distance-modeler.js"; +import { SearchPath } from "./search-path.js"; + +import Distribution = LexicalModelTypes.Distribution; +import Transform = LexicalModelTypes.Transform; + +export let SPACE_ID_SEED = 0; + +export function generateSpaceSeed(): number { + return SPACE_ID_SEED++; +} + +type NullPath = { + type: 'none' +} + +type IntermediateSearchPath = { + type: 'intermediate', + cost: number +} + +type CompleteSearchPath = { + type: 'complete', + cost: number, + finalNode: SearchNode, + spaceId: number +} + +export type PathResult = NullPath | IntermediateSearchPath | CompleteSearchPath; + +/** + * Represents all or a portion of the dynamically-generated graph used to search + * for predictive-text corrections. + */ +export interface SearchSpace { + /** + * Returns an identifier uniquely identifying this search-batching structure + * by correction-search results. + */ + readonly spaceId: number; + + // TODO: only truly needs to be the lookup set. + readonly rootPath: SearchPath; + + readonly parents: SearchSpace[]; + + /** + * Retrieves the lowest-cost / lowest-distance edge from the batcher's search + * area, checks its validity as a correction to the input text, and reports on + * what sort of result the edge's destination node represents. + * @returns + */ + handleNextNode(): PathResult; + + /** + * Denotes whether or not the represented search space includes paths built from + * the specified set of keystroke input distributions. The distribution count + * should match .inputCount - no omissions or extras are permitted. + * + * Designed explicitly for use in unit testing; it's not super-efficient, so + * avoid live use. + * + * Note: it will destroy the array passed into it. + * @param keystrokeDistributions + * @internal + */ + hasInputs(keystrokeDistributions: Distribution[]): boolean; + + /** + * Increases the editing range that will be considered for determining + * correction distances. + */ + increaseMaxEditDistance(): void; + + /** + * Ceases recording locally-reported results. + * + * This should be called once all descendants of this SearchSpace have been + * constructed, allowing them to first build new paths based upon them. + */ + stopTrackingResults(): void; + + /** + * Reports the cost of the lowest-cost / lowest-distance edge held within the + * batcher's search area. + * @returns + */ + readonly currentCost: number; + + /** + * Provides a heuristic for the base cost at this path's depth if the best + * individual input were taken here, regardless of whether or not that's + * possible. + * + * This cost is based on the negative log-likelihood of the probability and + * includes the cost from the lowest possible parent nodes visited. + */ + readonly lowestPossibleSingleCost: number; + + /** + * Returns the set of previously-processed results under this batcher's domain. + */ + readonly previousResults: SearchResult[]; + + /** + * When true, this indicates that the currently-represented portion of context + * has fat-finger data available, which itself indicates that the user has + * corrections enabled. + */ + readonly correctionsEnabled: boolean; + + /** + * Reports the total number of input keystrokes represented by this + * graph/subgraph. + * + * (Their fat-finger alternates, when provided, do not influence this count - + * they're associated with the original keystroke that affected the context.) + */ + readonly inputCount: number; + + /** + * Retrieves the sequences of inputs that led to this SearchSpace. + */ + readonly inputSequences: Distribution[][]; + + /** + * Reports the length in codepoints of corrected text represented by completed + * paths from this instance. + */ + readonly codepointLength: number; + + /** + * Determines the best example text representable by this batcher's portion of + * the correction-search graph and its paths. + */ + readonly bestExample: { text: string, p: number }; + + /** + * Increases the editing range that will be considered for determining + * correction distances. + */ + increaseMaxEditDistance(): void; +} \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts index e9bc98c80f6..77f1361a810 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts @@ -7,7 +7,7 @@ import { applySuggestionCasing, correctAndEnumerate, dedupeSuggestions, finalize import { detectCurrentCasing, determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; import { ContextTracker } from './correction/context-tracker.js'; -import { DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL } from './correction/search-path.js'; +import { DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL } from './correction/distance-modeler.js'; import CasingForm = LexicalModelTypes.CasingForm; import Configuration = LexicalModelTypes.Configuration; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 0ebb7b84447..fc6123f0459 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -5,10 +5,13 @@ import { defaultWordbreaker, WordBreakProperty } from '@keymanapp/models-wordbre import TransformUtils from './transformUtils.js'; import { determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; +import { ContextTokenization } from './correction/context-tokenization.js'; import { ContextTracker } from './correction/context-tracker.js'; import { ContextState, determineContextSlideTransform } from './correction/context-state.js'; import { ExecutionTimer } from './correction/execution-timer.js'; import ModelCompositor from './model-compositor.js'; +import { ContextTransition } from './test-index.js'; +import { getBestMatches } from './correction/distance-modeler.js'; const searchForProperty = defaultWordbreaker.searchForProperty; @@ -23,7 +26,6 @@ import Reversion = LexicalModelTypes.Reversion; import Suggestion = LexicalModelTypes.Suggestion; import SuggestionTag = LexicalModelTypes.SuggestionTag; import Transform = LexicalModelTypes.Transform; -import { ContextTransition, getBestMatches } from './test-index.js'; /* * The functions in this file exist to provide unit-testable stateless components for the @@ -325,6 +327,7 @@ export function determineContextTransition( */ export function determineSuggestionAlignment( transition: ContextTransition, + tokenization: ContextTokenization, lexicalModel: LexicalModel ): { /** @@ -337,7 +340,7 @@ export function determineSuggestionAlignment( */ deleteLeft: number } { - const transitionEdits = transition.final.tokenization.transitionEdits; + const transitionEdits = tokenization.transitionEdits; const context = transition.base.context; const postContext = transition.final.context; const inputTransform = transition.inputDistribution[0].sample; @@ -348,13 +351,13 @@ export function determineSuggestionAlignment( const wordbreak = determineModelWordbreaker(lexicalModel); // Is the token under construction newly-constructed / is there no pre-existing root? - if(transition.preservationTransform && inputTransformMap?.has(1)) { + if(tokenization.taillessTrueKeystroke && inputTransformMap?.has(1)) { return { // If the new token is due to whitespace or due to a different input type // that would likely imply a tokenization boundary, infer 'new word' mode. // Apply any part of the context change that is not considered to be up // for correction. - predictionContext: models.applyTransform(transition.preservationTransform, context), + predictionContext: models.applyTransform(tokenization.taillessTrueKeystroke, context), // As the word/token being corrected/predicted didn't originally exist, // there's no part of it to 'replace'. (Suggestions are applied to the // pre-transform state.) @@ -378,7 +381,7 @@ export function determineSuggestionAlignment( // Did the wordbreaker (or similar) append a blank token before the caret? If so, // preserve that by preventing corrections from triggering left-deletion. - if(transition.final.tokenization.tail.isEmptyToken) { + if(tokenization.tail.isEmptyToken) { deleteLeft = 0; } @@ -452,23 +455,27 @@ export async function correctAndEnumerate( } } - // No matter the prediction, once we know the root of the prediction, we'll always 'replace' the - // same amount of text. We can handle this before the big 'prediction root' loop. - const { predictionContext: predictionContext, deleteLeft } = determineSuggestionAlignment(transition, lexicalModel); - // TODO: Should we filter backspaces & whitespaces out of the transform distribution? // Ideally, the answer (in the future) will be no, but leaving it in right now may pose an issue. // The 'eventual' logic will be significantly more complex, though still manageable. - const searchPath = transition.final.tokenization.tail.searchPath; + const tokenizations = [transition.final.tokenization]; + const searchSpaces = tokenizations.map(t => t.tail.searchSpace); // If corrections are not enabled, bypass the correction search aspect // entirely. No need to 'search' - just do a direct lookup. // // To be clear: this IS how we actually tell that corrections are disabled - // when no fat-finger data is available. - if(!searchPath.correctionsEnabled) { + if(!searchSpaces.find(s => s.correctionsEnabled)) { const wordbreak = determineModelWordbreaker(lexicalModel); + // The one true tokenization: no corrections permitted. + const tokenization = transition.final.tokenization; + + // No matter the prediction, once we know the root of the prediction, we'll always 'replace' the + // same amount of text. We can handle this before the big 'prediction root' loop. + const { predictionContext: predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, lexicalModel); + const predictionRoot = { sample: { insert: wordbreak(transition.final.context), @@ -479,7 +486,7 @@ export async function correctAndEnumerate( }; const predictions = predictFromCorrections(lexicalModel, [predictionRoot], predictionContext); - predictions.forEach((entry) => entry.preservationTransform = transition.preservationTransform); + predictions.forEach((entry) => entry.preservationTransform = tokenization.taillessTrueKeystroke); // Only one 'correction' / prediction root is allowed - the actual text. return { @@ -493,9 +500,16 @@ export async function correctAndEnumerate( let rawPredictions: CorrectionPredictionTuple[] = []; let bestCorrectionCost: number; const correctionPredictionMap: Record> = {}; - for await(const match of getBestMatches(searchPath, timer)) { + for await(const match of getBestMatches(searchSpaces, timer)) { // Corrections obtained: now to predict from them! const correction = match.matchString; + const searchSpace = searchSpaces.find(s => s.spaceId == match.spaceId); + const tokenization = tokenizations.find(t => t.spaceId == match.spaceId); + + // No matter the prediction, once we know the root of the prediction, we'll + // always 'replace' the same amount of text. We can handle this before the + // big 'prediction root' loop. + const { predictionContext, deleteLeft } = determineSuggestionAlignment(transition, tokenization, lexicalModel); // If our 'match' results in fully deleting the new token, reject it and try again. if(match.matchSequence.length == 0 && match.inputSequence.length != 0) { @@ -532,7 +546,7 @@ export async function correctAndEnumerate( * Worst-case, it's possible to temporarily add normalization if a code deep-dive * is needed in the future. */ - if(searchPath.inputSequence.length <= 1) { + if(searchSpace.inputCount <= 1) { /* Suppose a key distribution: most likely with p=0.5, second-most with 0.4 - a pretty * ambiguous case that would only arise very near the center of the boundary between two keys. * Raising (0.5/0.4)^16 ~= 35.53. (At time of writing, SINGLE_CHAR_KEY_PROB_EXPONENT = 16.) @@ -553,7 +567,7 @@ export async function correctAndEnumerate( }; let predictions = predictFromCorrections(lexicalModel, [predictionRoot], predictionContext); - predictions.forEach((entry) => entry.preservationTransform = transition.preservationTransform); + predictions.forEach((entry) => entry.preservationTransform = tokenization.taillessTrueKeystroke); // Only set 'best correction' cost when a correction ACTUALLY YIELDS predictions. if(predictions.length > 0 && bestCorrectionCost === undefined) { diff --git a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts index 2ec0e8b856b..dfe78b96292 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts @@ -5,7 +5,9 @@ export * from './correction/context-tokenization.js'; export { ContextTracker } from './correction/context-tracker.js'; export { ContextTransition } from './correction/context-transition.js'; export * from './correction/distance-modeler.js'; +export * from './correction/search-cluster.js'; export * from './correction/search-path.js'; +export * from './correction/search-space.js'; export { ExtendedEditOperation, SegmentableDistanceCalculation } from './correction/segmentable-calculation.js'; export * from './correction/tokenization-subsets.js'; export * as correction from './correction/index.js'; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts index cd36833e2ff..9eb9b661324 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-state.tests.ts @@ -13,7 +13,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { ContextState, determineContextSlideTransform, models } from '@keymanapp/lm-worker/test-index'; +import { ContextState, determineContextSlideTransform, models, SearchPath } from '@keymanapp/lm-worker/test-index'; import Context = LexicalModelTypes.Context; import Transform = LexicalModelTypes.Transform; @@ -248,21 +248,15 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); // We want to preserve the added whitespace when predicting a token that follows after it. - assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 }); + assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 }); // The 'wordbreak' transform let state = newContextMatch?.final; - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchPath.inputSequence); - assert.sameDeepMembers( - state.tokenization.tokens[state.tokenization.tokens.length - 1].searchPath.inputSequence, - [[{sample: { insert: '', deleteLeft: 0 }, p: 1}]] - ); - - // if(!newContextMatch.final.tokenization.alignment.canAlign) { - // assert.fail("context alignment failed"); - // } - // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); + // space transform + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputCount, 1); + // empty transform + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputCount, 1); + assert.deepEqual((state.tokenization.tail.searchSpace.parents[0] as SearchPath).lastInput, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]); }); it("properly matches and aligns when whitespace before final empty token is extended", function() { @@ -280,21 +274,16 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); // We want to preserve the added whitespace when predicting a token that follows after it. - assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 }); + assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 }); // The 'wordbreak' transform let state = newContextMatch?.final; - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchPath.inputSequence); - assert.deepEqual( - state.tokenization.tokens[state.tokenization.tokens.length - 1].searchPath.inputSequence, - [[{ sample: {insert: '', deleteLeft: 0}, p: 1 }]] - ); - - // if(!newContextMatch.final.tokenization.alignment.canAlign) { - // assert.fail("context alignment failed"); - // } - // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 0); + // Two whitespaces, one of which is new! + const preTail = state.tokenization.tokens[state.tokenization.tokens.length - 2]; + assert.equal(preTail.searchSpace.inputCount, 2); + assert.deepEqual((preTail.searchSpace.parents[0] as SearchPath).lastInput, [{sample: transform, p: 1}]); + assert.equal(state.tokenization.tail.searchSpace.inputCount, 1); + assert.deepEqual((state.tokenization.tail.searchSpace.parents[0] as SearchPath).lastInput, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]); }); it("properly matches and aligns when a 'wordbreak' is removed via backspace", function() { @@ -311,12 +300,6 @@ describe('ContextState', () => { let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform)); assert.isOk(newContextMatch?.final); assert.deepEqual(newContextMatch?.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - - // if(!newContextMatch.final.tokenization.alignment.canAlign) { - // assert.fail("context alignment failed"); - // } - // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, -2); }); it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() { @@ -333,18 +316,12 @@ describe('ContextState', () => { let newContextMatch = baseState.analyzeTransition(existingContext, toWrapperDistribution(transform)); assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); - assert.deepEqual(newContextMatch.preservationTransform, { insert: '', deleteLeft: 0 }); + assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: '', deleteLeft: 0 }); // The 'wordbreak' transform let state = newContextMatch.final; - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchPath.inputSequence); - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchPath.inputSequence); - - // if(!newContextMatch.final.tokenization.alignment.canAlign) { - // assert.fail("context alignment failed"); - // } - // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 1); + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputCount, 1); + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputCount, 1); }) // Needs improved context-state management (due to 2x tokens) @@ -366,14 +343,13 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); // We want to preserve the added whitespace when predicting a token that follows after it. - assert.deepEqual(newContextMatch.preservationTransform, { insert: ' ', deleteLeft: 0 }); + assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: ' ', deleteLeft: 0 }); // The 'wordbreak' transform let state = newContextMatch.final; - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchPath.inputSequence); - assert.deepEqual( - state.tokenization.tokens[state.tokenization.tokens.length - 1].searchPath.inputSequence, - [[{sample: {insert: '', deleteLeft: 0}, p: 1}]] + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputCount, 1); + assert.equal( + state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputCount, 1 ); // if(!newContextMatch.final.tokenization.alignment.canAlign) { @@ -398,21 +374,14 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); // We want to preserve all text preceding the new token when applying a suggestion. - assert.deepEqual(newContextMatch.preservationTransform, { insert: 'd ', deleteLeft: 0}); + assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: 'd ', deleteLeft: 0}); // The 'wordbreak' transform let state = newContextMatch.final; - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchPath.inputSequence); - assert.deepEqual( - state.tokenization.tokens[state.tokenization.tokens.length - 1].searchPath.inputSequence, - [[{sample: {insert: '', deleteLeft: 0}, p: 1}]] + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputCount, 1); + assert.equal( + state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputCount, 1 ); - - // if(!newContextMatch.final.tokenization.alignment.canAlign) { - // assert.fail("context alignment failed"); - // } - // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); }); it("properly matches and aligns when tail token is modified AND a 'wordbreak' is added'", function() { @@ -430,21 +399,15 @@ describe('ContextState', () => { assert.isNotNull(newContextMatch?.final); assert.deepEqual(newContextMatch.final.tokenization.tokens.map(token => token.exampleInput), rawTokens); // We want to preserve all text preceding the new token when applying a suggestion. - assert.deepEqual(newContextMatch.preservationTransform, { insert: 'tor ', deleteLeft: 0 }); + assert.deepEqual(newContextMatch.final.tokenization.taillessTrueKeystroke, { insert: 'tor ', deleteLeft: 0 }); // The 'wordbreak' transform let state = newContextMatch.final; - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchPath.inputSequence); - assert.isNotEmpty(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchPath.inputSequence); - - // if(!newContextMatch.final.tokenization.alignment.canAlign) { - // assert.fail("context alignment failed"); - // } - // assert.equal(newContextMatch.final.tokenization.alignment.leadTokenShift, 0); - // assert.equal(newContextMatch.final.tokenization.alignment.tailTokenShift, 2); + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 2].searchSpace.inputCount, 1); + assert.equal(state.tokenization.tokens[state.tokenization.tokens.length - 1].searchSpace.inputCount, 1); }); - it('handles case where tail token is split into three rather than two', function() { + it.skip('handles case where tail token is split into three rather than two', function() { let baseContext = models.tokenize(defaultBreaker, { left: "text'", startOfBuffer: true, endOfBuffer: true }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts index 1cd56e9a2ab..ab6bd7cb809 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts @@ -16,7 +16,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types'; import { ContextToken, correction, getBestMatches, models, preprocessInputSources } from '@keymanapp/lm-worker/test-index'; -import Distribution = LexicalModelTypes.Distribution; +// import Distribution = LexicalModelTypes.Distribution; import ExecutionTimer = correction.ExecutionTimer; import Transform = LexicalModelTypes.Transform; import TrieModel = models.TrieModel; @@ -25,25 +25,25 @@ import { KMWString } from '@keymanapp/web-utils'; var plainModel = new TrieModel(jsonFixture('models/tries/english-1000'), {wordBreaker: defaultBreaker}); -// https://www.compart.com/en/unicode/block/U+1D400 -const mathBoldUpperA = 0x1D400; // Mathematical Bold Capital A -const mathBoldLowerA = 0x1D41A; // Small A - -function toMathematicalSMP(text: string) { - const chars = [...text]; - - const asSMP = chars.map((c) => { - if(c >= 'a' && c <= 'z') { - return String.fromCodePoint(mathBoldLowerA + (c.charCodeAt(0) - 'a'.charCodeAt(0))); - } else if(c >= 'A' && c <= 'Z') { - return String.fromCodePoint(mathBoldUpperA + (c.charCodeAt(0) - 'A'.charCodeAt(0))); - } else { - return c; - } - }); +// // https://www.compart.com/en/unicode/block/U+1D400 +// const mathBoldUpperA = 0x1D400; // Mathematical Bold Capital A +// const mathBoldLowerA = 0x1D41A; // Small A + +// function toMathematicalSMP(text: string) { +// const chars = [...text]; + +// const asSMP = chars.map((c) => { +// if(c >= 'a' && c <= 'z') { +// return String.fromCodePoint(mathBoldLowerA + (c.charCodeAt(0) - 'a'.charCodeAt(0))); +// } else if(c >= 'A' && c <= 'Z') { +// return String.fromCodePoint(mathBoldUpperA + (c.charCodeAt(0) - 'A'.charCodeAt(0))); +// } else { +// return c; +// } +// }); - return asSMP.join(''); -} +// return asSMP.join(''); +// } describe('ContextToken', function() { before(() => { @@ -54,12 +54,12 @@ describe('ContextToken', function() { it("(model: LexicalModel)", async () => { let token = new ContextToken(plainModel); - assert.isEmpty(token.searchPath.inputSequence); + assert.equal(token.searchSpace.inputCount, 0); assert.isEmpty(token.exampleInput); assert.isFalse(token.isWhitespace); // While searchSpace has no inputs, it _can_ match lexicon entries (via insertions). - let searchIterator = getBestMatches(token.searchPath, new ExecutionTimer(Number.POSITIVE_INFINITY, Number.POSITIVE_INFINITY)); + let searchIterator = getBestMatches([token.searchSpace], new ExecutionTimer(Number.POSITIVE_INFINITY, Number.POSITIVE_INFINITY)); let firstEntry = await searchIterator.next(); assert.isFalse(firstEntry.done); }); @@ -67,11 +67,11 @@ describe('ContextToken', function() { it("(model: LexicalModel, text: string)", () => { let token = new ContextToken(plainModel, "and"); - assert.isNotEmpty(token.searchPath.inputSequence); + assert.isNotEmpty(token.searchSpace.inputSequences?.[0]); - assert.equal(token.searchPath.inputSequence.map((entry) => entry[0].sample.insert).join(''), 'and'); - token.searchPath.inputSequence.forEach((entry) => assert.equal(entry[0].sample.deleteLeft, 0)); - assert.deepEqual(token.searchPath.inputSequence, [..."and"].map((char) => { + assert.equal(token.searchSpace.inputSequences[0].map((entry) => entry[0].sample.insert).join(''), 'and'); + token.searchSpace.inputSequences[0].forEach((entry) => assert.equal(entry[0].sample.deleteLeft, 0)); + assert.deepEqual(token.searchSpace.inputSequences[0], [..."and"].map((char) => { return [{ sample: { insert: char, @@ -82,6 +82,13 @@ describe('ContextToken', function() { })); assert.equal(token.exampleInput, 'and'); + assert.equal(token.searchSpace.inputCount, 3); + assert.isTrue(token.searchSpace.hasInputs([ + [{sample: { insert: 'a', deleteLeft: 0 }, p: 1}], + [{sample: { insert: 'n', deleteLeft: 0 }, p: 1}], + [{sample: { insert: 'd', deleteLeft: 0 }, p: 1}] + ])); + assert.isFalse(token.isWhitespace); }); @@ -90,11 +97,10 @@ describe('ContextToken', function() { let baseToken = new ContextToken(plainModel, "and"); let clonedToken = new ContextToken(baseToken); - assert.equal(clonedToken.searchPath, baseToken.searchPath); + assert.equal(clonedToken.searchSpace, baseToken.searchSpace); // Deep equality on .searchSpace can't be directly checked due to the internal complexities involved. // We CAN check for the most important members, though. - assert.notEqual(clonedToken.searchPath.inputSequence, baseToken.searchPath.inputSequence); - assert.deepEqual(clonedToken.searchPath.inputSequence, baseToken.searchPath.inputSequence); + assert.equal(clonedToken.searchSpace, baseToken.searchSpace); assert.notEqual(clonedToken, baseToken); // Perfectly deep-equal when we ignore .searchSpace. @@ -102,459 +108,465 @@ describe('ContextToken', function() { }); }); - describe("merge()", () => { - it("merges three tokens without previously-split transforms", () => { - const token1 = new ContextToken(plainModel, "can"); - const token2 = new ContextToken(plainModel, "'"); - const token3 = new ContextToken(plainModel, "t"); - - const merged = ContextToken.merge([token1, token2, token3], plainModel); - assert.equal(merged.exampleInput, "can't"); - token1.inputRange.forEach((entry) => assert.isTrue(merged.inputRange.indexOf(entry) > -1)); - token2.inputRange.forEach((entry) => assert.isTrue(merged.inputRange.indexOf(entry) > -1)); - token3.inputRange.forEach((entry) => assert.isTrue(merged.inputRange.indexOf(entry) > -1)); - - assert.sameOrderedMembers(merged.searchPath.inputSequence.slice(0, 3), token1.searchPath.inputSequence); - assert.sameOrderedMembers(merged.searchPath.inputSequence.slice(3, 4), token2.searchPath.inputSequence); - assert.sameOrderedMembers(merged.searchPath.inputSequence.slice(4), token3.searchPath.inputSequence); - }); - - it("merges three tokens from single previously-split transforms", () => { - const srcTransform = { insert: "can't", deleteLeft: 0, deleteRight: 0, id: 1 }; - - const token1 = new ContextToken(plainModel); - const token2 = new ContextToken(plainModel); - const token3 = new ContextToken(plainModel); - - token1.addInput({ - trueTransform: srcTransform, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: {insert: 'can', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); - - token2.addInput({ - trueTransform: srcTransform, - inputStartIndex: 3, - bestProbFromSet: 1 - }, [{sample: {insert: "'", deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); - - token3.addInput({ - trueTransform: srcTransform, - inputStartIndex: 4, - bestProbFromSet: 1 - }, [{sample: {insert: 't', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); - - const merged = ContextToken.merge([token1, token2, token3], plainModel); - assert.equal(merged.exampleInput, "can't"); - assert.deepEqual(merged.inputRange, [ { trueTransform: srcTransform, inputStartIndex: 0, bestProbFromSet: 1 } ]); - assert.deepEqual(merged.searchPath.inputSequence, [[{sample: srcTransform, p: 1}]]); - }); - - it("merges four tokens with previously-split transforms", () => { - // TODO: need another case - pref where there are two diff boundary transforms - // and where each token has multiple constituent transforms. - const srcTransform1 = { insert: "apple", deleteLeft: 0, deleteRight: 0, id: 1 }; - const srcTransform2 = { insert: "sands", deleteLeft: 0, deleteRight: 0, id: 2 }; - const srcTransform3 = { insert: "our", deleteLeft: 0, deleteRight: 0, id: 3 }; - const srcTransform4 = { insert: "grapes", deleteLeft: 0, deleteRight: 0, id: 4 }; - const srcTransforms = [srcTransform1, srcTransform2, srcTransform3, srcTransform4]; - - // apples - const token1 = new ContextToken(plainModel); - // and - const token2 = new ContextToken(plainModel); - // sour - const token3 = new ContextToken(plainModel); - // grapes - const token4 = new ContextToken(plainModel); - const tokensToMerge = [token1, token2, token3, token4] - - token1.addInput({ - trueTransform: srcTransform1, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: srcTransform1, p: 1}]); - token1.addInput({ - trueTransform: srcTransform2, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); - - token2.addInput({ - trueTransform: srcTransform2, - inputStartIndex: 1, - bestProbFromSet: 1 - }, [{sample: {insert: "and", deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); - - token3.addInput({ - trueTransform: srcTransform2, - inputStartIndex: 4, - bestProbFromSet: 1 - }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); - token3.addInput({ - trueTransform: srcTransform3, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: srcTransform3, p: 1}]); - - token4.addInput({ - trueTransform: srcTransform4, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: srcTransform4, p: 1}]); - - const merged = ContextToken.merge(tokensToMerge, plainModel); - assert.equal(merged.exampleInput, "applesandsourgrapes"); - assert.deepEqual(merged.inputRange, srcTransforms.map((t) => ({ trueTransform: t, inputStartIndex: 0, bestProbFromSet: 1 }) )); - assert.deepEqual(merged.searchPath.inputSequence, srcTransforms.map((t) => [{sample: t, p: 1}])); - }); - - it("merges four tokens with previously-split transforms - non-BMP text", () => { - // TODO: need another case - pref where there are two diff boundary transforms - // and where each token has multiple constituent transforms. - const srcTransform1 = { insert: toMathematicalSMP("apple"), deleteLeft: 0, deleteRight: 0, id: 1 }; - const srcTransform2 = { insert: toMathematicalSMP("sands"), deleteLeft: 0, deleteRight: 0, id: 2 }; - const srcTransform3 = { insert: toMathematicalSMP("our"), deleteLeft: 0, deleteRight: 0, id: 3 }; - const srcTransform4 = { insert: toMathematicalSMP("grapes"), deleteLeft: 0, deleteRight: 0, id: 4 }; - const srcTransforms = [srcTransform1, srcTransform2, srcTransform3, srcTransform4]; - - // apples - const token1 = new ContextToken(plainModel); - // and - const token2 = new ContextToken(plainModel); - // sour - const token3 = new ContextToken(plainModel); - // grapes - const token4 = new ContextToken(plainModel); - const tokensToMerge = [token1, token2, token3, token4] - - token1.addInput({ - trueTransform: srcTransform1, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: srcTransform1, p: 1}]); - token1.addInput({ - trueTransform: srcTransform2, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); - - token2.addInput({ - trueTransform: srcTransform2, - inputStartIndex: 1, - bestProbFromSet: 1 - }, [{sample: {insert: toMathematicalSMP("and"), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); - - token3.addInput({ - trueTransform: srcTransform2, - inputStartIndex: 4, - bestProbFromSet: 1 - }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); - token3.addInput({ - trueTransform: srcTransform3, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: srcTransform3, p: 1}]); - - token4.addInput({ - trueTransform: srcTransform4, - inputStartIndex: 0, - bestProbFromSet: 1 - }, [{sample: srcTransform4, p: 1}]); - - const merged = ContextToken.merge(tokensToMerge, plainModel); - assert.equal(merged.exampleInput, toMathematicalSMP("applesandsourgrapes")); - assert.deepEqual(merged.inputRange, srcTransforms.map((t) => ({ trueTransform: t, inputStartIndex: 0, bestProbFromSet: 1 }) )); - assert.deepEqual(merged.searchPath.inputSequence, srcTransforms.map((t) => [{sample: t, p: 1}])); - }); - }); - - describe("split()", () => { - it("handles clean two-way split correctly", () => { - // Setup phase - const keystrokeDistributions: Distribution[] = [ - [ - { sample: { insert: 'c', deleteLeft: 0 }, p: 0.75 }, - { sample: { insert: 't', deleteLeft: 0 }, p: 0.25 } - ], - [ - { sample: { insert: 'a', deleteLeft: 0 }, p: 0.75 }, - { sample: { insert: 'o', deleteLeft: 0 }, p: 0.25 } - ], - [ - { sample: { insert: 'n', deleteLeft: 0 }, p: 0.75 }, - { sample: { insert: 'r', deleteLeft: 0 }, p: 0.25 } - ], - [ - { sample: { insert: '\'', deleteLeft: 0 }, p: 0.75 }, - { sample: { insert: 't', deleteLeft: 0 }, p: 0.25 } - ] - ] - - const tokenToSplit = new ContextToken(plainModel); - for(let i = 0; i < keystrokeDistributions.length; i++) { - tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: .75}, keystrokeDistributions[i]); - }; - - assert.equal(tokenToSplit.sourceText, 'can\''); - assert.deepEqual(tokenToSplit.searchPath.inputSequence, keystrokeDistributions); - - // And now for the "fun" part. - const resultsOfSplit = tokenToSplit.split({ - // Input portion here can be ignored. - input: { - text: 'can\'', - index: 0 - }, matches: [ - // For this part, the text entries are what really matters. - { text: 'can', index: 0, textOffset: 0 }, - { text: '\'', index: 1, textOffset: 3 } - ] - }, plainModel); - - assert.equal(resultsOfSplit.length, 2); - assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), ['can', '\'']); - assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.searchPath.inputSequence), [ - keystrokeDistributions.slice(0, 3), - [keystrokeDistributions[3]] - ]); - }); - - it("handles mid-transform splits correctly", () => { - // Setup phase - const keystrokeDistributions: Distribution[] = [ - [ - { sample: { insert: 'biglargetransform', deleteLeft: 0, deleteRight: 0 }, p: 1 }, - ] - ]; - const splitTextArray = ['big', 'large', 'transform']; - - const tokenToSplit = new ContextToken(plainModel); - for(let i = 0; i < keystrokeDistributions.length; i++) { - tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: 1}, keystrokeDistributions[i]); - }; - - assert.equal(tokenToSplit.sourceText, 'biglargetransform'); - assert.deepEqual(tokenToSplit.searchPath.inputSequence, keystrokeDistributions); - - // And now for the "fun" part. - const resultsOfSplit = tokenToSplit.split({ - // Input portion here can be ignored. - input: { - text: 'biglargetransform', - index: 0 - }, matches: [ - // For this part, the text entries are what really matters. - { text: 'big', index: 0, textOffset: 0 }, - { text: 'large', index: 1, textOffset: 3 }, - { text: 'transform', index: 2, textOffset: 8 } - ] - }, plainModel); - - assert.equal(resultsOfSplit.length, 3); - assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); - assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputRange[0]), [0, 3, 8].map(i => ({ - trueTransform: { - insert: 'biglargetransform', - deleteLeft: 0, - deleteRight: 0 - }, - inputStartIndex: i, - bestProbFromSet: 1 - }))); - assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.searchPath.inputSequence[0]), splitTextArray.map(t => [{ - sample: { insert: t, deleteLeft: 0, deleteRight: 0 }, p: 1 - }])); - }); - - it("handles messy mid-transform splits correctly", () => { - // Setup phase - const keystrokeDistributions: Distribution[] = [ - [ - { sample: { insert: 'long', deleteLeft: 0, deleteRight: 0, id: 11 }, p: 1 } - ], [ - { sample: { insert: 'argelovely', deleteLeft: 3, deleteRight: 0, id: 12 }, p: 1 } - ], [ - { sample: { insert: 'ngtransforms', deleteLeft: 4, deleteRight: 0, id: 13 }, p: 1 } - ] - ]; - const splitTextArray = ['large', 'long', 'transforms']; - - const tokenToSplit = new ContextToken(plainModel); - for(let i = 0; i < keystrokeDistributions.length; i++) { - tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: 1}, keystrokeDistributions[i]); - }; - - assert.equal(tokenToSplit.exampleInput, 'largelongtransforms'); - assert.deepEqual(tokenToSplit.searchPath.inputSequence, keystrokeDistributions); - - // And now for the "fun" part. - const resultsOfSplit = tokenToSplit.split({ - // Input portion here can be ignored. - input: { - text: 'largelongtransforms', - index: 0 - }, matches: [ - // For this part, the text entries are what really matters. - { text: 'large', index: 0, textOffset: 0 }, - { text: 'long', index: 1, textOffset: 5 }, - { text: 'transforms', index: 2, textOffset: 9 } - ] - }, plainModel); - - assert.equal(resultsOfSplit.length, 3); - assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); - assert.deepEqual(resultsOfSplit[0].inputRange, [ - { trueTransform: keystrokeDistributions[0][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, - { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, - ]); - assert.deepEqual(resultsOfSplit[1].inputRange, [ - { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 'arge'.length, bestProbFromSet: 1 }, - { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, - ]); - assert.deepEqual(resultsOfSplit[2].inputRange, [ - { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 'ng'.length, bestProbFromSet: 1 } - ]); - - assert.deepEqual(resultsOfSplit[0].searchPath.inputSequence, [ - keystrokeDistributions[0], - keystrokeDistributions[1].map((entry) => { - return { - sample: { - ...entry.sample, - insert: entry.sample.insert.slice(0, 4) // gets the 'arge' portion & the deleteLefts. - }, p: entry.p - } - }), - ]); - - assert.deepEqual(resultsOfSplit[1].searchPath.inputSequence, [ - keystrokeDistributions[1].map((entry) => { - return { - sample: { - ...entry.sample, - insert: entry.sample.insert.slice('arge'.length), - deleteLeft: 0 - }, p: entry.p - } - }), - keystrokeDistributions[2].map((entry) => { - return { - sample: { - ...entry.sample, - insert: entry.sample.insert.slice(0, 'ng'.length), // gets the 'ng' portion. - }, p: entry.p - } - }), - ]); - - assert.deepEqual(resultsOfSplit[2].searchPath.inputSequence, [ - keystrokeDistributions[2].map((entry) => { - return { - sample: { - ...entry.sample, - insert: entry.sample.insert.slice('ng'.length), // drops the 'ng' portion. - deleteLeft: 0 - }, p: entry.p - } - }), - ]); - }); - - it("handles messy mid-transform splits correctly - non-BMP text", () => { - // Setup phase - const keystrokeDistributions: Distribution[] = [ - [ - { sample: { insert: toMathematicalSMP('long'), deleteLeft: 0, deleteRight: 0, id: 11 }, p: 1 } - ], [ - { sample: { insert: toMathematicalSMP('argelovely'), deleteLeft: 3, deleteRight: 0, id: 12 }, p: 1 } - ], [ - { sample: { insert: toMathematicalSMP('ngtransforms'), deleteLeft: 4, deleteRight: 0, id: 13 }, p: 1 } - ] - ]; - const splitTextArray = ['large', 'long', 'transforms'].map(t => toMathematicalSMP(t)); - - const tokenToSplit = new ContextToken(plainModel); - for(let i = 0; i < keystrokeDistributions.length; i++) { - tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: 1}, keystrokeDistributions[i]); - }; - - assert.equal(tokenToSplit.exampleInput, toMathematicalSMP('largelongtransforms')); - assert.deepEqual(tokenToSplit.searchPath.inputSequence, keystrokeDistributions); - - // And now for the "fun" part. - const resultsOfSplit = tokenToSplit.split({ - // Input portion here can be ignored. - input: { - text: toMathematicalSMP('largelongtransforms'), - index: 0 - }, matches: [ - // For this part, the text entries are what really matters. - { text: toMathematicalSMP('large'), index: 0, textOffset: 0 }, - { text: toMathematicalSMP('long'), index: 1, textOffset: 5 }, - { text: toMathematicalSMP('transforms'), index: 2, textOffset: 9 } - ] - }, plainModel); - - assert.equal(resultsOfSplit.length, 3); - assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); - assert.deepEqual(resultsOfSplit[0].inputRange, [ - { trueTransform: keystrokeDistributions[0][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, - { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, - ]); - assert.deepEqual(resultsOfSplit[1].inputRange, [ - { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 'arge'.length, bestProbFromSet: 1 }, - { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, - ]); - assert.deepEqual(resultsOfSplit[2].inputRange, [ - { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 'ng'.length, bestProbFromSet: 1 } - ]); - - assert.deepEqual(resultsOfSplit[0].searchPath.inputSequence, [ - keystrokeDistributions[0], - keystrokeDistributions[1].map((entry) => { - return { - sample: { - ...entry.sample, - insert: KMWString.substring(entry.sample.insert, 0, 4) // gets the 'arge' portion & the deleteLefts. - }, p: entry.p - } - }), - ]); - - assert.deepEqual(resultsOfSplit[1].searchPath.inputSequence, [ - keystrokeDistributions[1].map((entry) => { - return { - sample: { - ...entry.sample, - insert: KMWString.substring(entry.sample.insert, 'arge'.length), - deleteLeft: 0 - }, p: entry.p - } - }), - keystrokeDistributions[2].map((entry) => { - return { - sample: { - ...entry.sample, - insert: KMWString.substring(entry.sample.insert, 0, 'ng'.length), // gets the 'ng' portion. - }, p: entry.p - } - }), - ]); - - assert.deepEqual(resultsOfSplit[2].searchPath.inputSequence, [ - keystrokeDistributions[2].map((entry) => { - return { - sample: { - ...entry.sample, - insert: KMWString.substring(entry.sample.insert, 'ng'.length), // drops the 'ng' portion. - deleteLeft: 0 - }, p: entry.p - } - }), - ]); - }); - }); + // describe("merge()", () => { + // it("merges three tokens without previously-split transforms", () => { + // const token1 = new ContextToken(plainModel, "can"); + // const token2 = new ContextToken(plainModel, "'"); + // const token3 = new ContextToken(plainModel, "t"); + + // const merged = ContextToken.merge([token1, token2, token3], plainModel); + // assert.equal(merged.exampleInput, "can't"); + // token1.inputRange.forEach((entry) => assert.isTrue(merged.inputRange.indexOf(entry) > -1)); + // token2.inputRange.forEach((entry) => assert.isTrue(merged.inputRange.indexOf(entry) > -1)); + // token3.inputRange.forEach((entry) => assert.isTrue(merged.inputRange.indexOf(entry) > -1)); + + // assert.sameOrderedMembers(merged.searchSpace.inputSequences[0].slice(0, 3), token1.searchSpace.inputSequences[0].slice()); + // assert.sameOrderedMembers(merged.searchSpace.inputSequences[0].slice(3, 4), token2.searchSpace.inputSequences[0].slice()); + // assert.sameOrderedMembers(merged.searchSpace.inputSequences[0].slice(4), token3.searchSpace.inputSequences[0].slice()); + // }); + + // it("merges three tokens from single previously-split transforms", () => { + // const srcTransform = { insert: "can't", deleteLeft: 0, deleteRight: 0, id: 1 }; + + // const token1 = new ContextToken(plainModel); + // const token2 = new ContextToken(plainModel); + // const token3 = new ContextToken(plainModel); + + // token1.addInput({ + // trueTransform: srcTransform, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: {insert: 'can', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); + + // token2.addInput({ + // trueTransform: srcTransform, + // inputStartIndex: 3, + // bestProbFromSet: 1 + // }, [{sample: {insert: "'", deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); + + // token3.addInput({ + // trueTransform: srcTransform, + // inputStartIndex: 4, + // bestProbFromSet: 1 + // }, [{sample: {insert: 't', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]); + + // const merged = ContextToken.merge([token1, token2, token3], plainModel); + // assert.equal(merged.exampleInput, "can't"); + // assert.deepEqual(merged.inputRange, [ { trueTransform: srcTransform, inputStartIndex: 0, bestProbFromSet: 1 } ]); + // assert.equal(merged.searchSpace.inputCount, 1); + // assert.deepEqual((merged.searchSpace as SearchPath).lastInput, [{sample: srcTransform, p: 1}]); + // }); + + // it("merges four tokens with previously-split transforms", () => { + // // TODO: need another case - pref where there are two diff boundary transforms + // // and where each token has multiple constituent transforms. + // const srcTransform1 = { insert: "apple", deleteLeft: 0, deleteRight: 0, id: 1 }; + // const srcTransform2 = { insert: "sands", deleteLeft: 0, deleteRight: 0, id: 2 }; + // const srcTransform3 = { insert: "our", deleteLeft: 0, deleteRight: 0, id: 3 }; + // const srcTransform4 = { insert: "grapes", deleteLeft: 0, deleteRight: 0, id: 4 }; + // const srcTransforms = [srcTransform1, srcTransform2, srcTransform3, srcTransform4]; + + // // apples + // const token1 = new ContextToken(plainModel); + // // and + // const token2 = new ContextToken(plainModel); + // // sour + // const token3 = new ContextToken(plainModel); + // // grapes + // const token4 = new ContextToken(plainModel); + // const tokensToMerge = [token1, token2, token3, token4] + + // token1.addInput({ + // trueTransform: srcTransform1, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: srcTransform1, p: 1}]); + // token1.addInput({ + // trueTransform: srcTransform2, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); + + // token2.addInput({ + // trueTransform: srcTransform2, + // inputStartIndex: 1, + // bestProbFromSet: 1 + // }, [{sample: {insert: "and", deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); + + // token3.addInput({ + // trueTransform: srcTransform2, + // inputStartIndex: 4, + // bestProbFromSet: 1 + // }, [{sample: {insert: 's', deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); + // token3.addInput({ + // trueTransform: srcTransform3, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: srcTransform3, p: 1}]); + + // token4.addInput({ + // trueTransform: srcTransform4, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: srcTransform4, p: 1}]); + + // const merged = ContextToken.merge(tokensToMerge, plainModel); + // assert.equal(merged.exampleInput, "applesandsourgrapes"); + // assert.deepEqual(merged.inputRange, srcTransforms.map((t) => ({ trueTransform: t, inputStartIndex: 0, bestProbFromSet: 1 }) )); + // assert.isTrue(merged.searchSpace.hasInputs( + // srcTransforms.map((t) => ([{sample: t, p: 1}])) + // )); + // }); + + // it("merges four tokens with previously-split transforms - non-BMP text", () => { + // // TODO: need another case - pref where there are two diff boundary transforms + // // and where each token has multiple constituent transforms. + // const srcTransform1 = { insert: toMathematicalSMP("apple"), deleteLeft: 0, deleteRight: 0, id: 1 }; + // const srcTransform2 = { insert: toMathematicalSMP("sands"), deleteLeft: 0, deleteRight: 0, id: 2 }; + // const srcTransform3 = { insert: toMathematicalSMP("our"), deleteLeft: 0, deleteRight: 0, id: 3 }; + // const srcTransform4 = { insert: toMathematicalSMP("grapes"), deleteLeft: 0, deleteRight: 0, id: 4 }; + // const srcTransforms = [srcTransform1, srcTransform2, srcTransform3, srcTransform4]; + + // // apples + // const token1 = new ContextToken(plainModel); + // // and + // const token2 = new ContextToken(plainModel); + // // sour + // const token3 = new ContextToken(plainModel); + // // grapes + // const token4 = new ContextToken(plainModel); + // const tokensToMerge = [token1, token2, token3, token4] + + // token1.addInput({ + // trueTransform: srcTransform1, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: srcTransform1, p: 1}]); + // token1.addInput({ + // trueTransform: srcTransform2, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); + + // token2.addInput({ + // trueTransform: srcTransform2, + // inputStartIndex: 1, + // bestProbFromSet: 1 + // }, [{sample: {insert: toMathematicalSMP("and"), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); + + // token3.addInput({ + // trueTransform: srcTransform2, + // inputStartIndex: 4, + // bestProbFromSet: 1 + // }, [{sample: {insert: toMathematicalSMP('s'), deleteLeft: 0, deleteRight: 0, id: 2}, p: 1}]); + // token3.addInput({ + // trueTransform: srcTransform3, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: srcTransform3, p: 1}]); + + // token4.addInput({ + // trueTransform: srcTransform4, + // inputStartIndex: 0, + // bestProbFromSet: 1 + // }, [{sample: srcTransform4, p: 1}]); + + // const merged = ContextToken.merge(tokensToMerge, plainModel); + // assert.equal(merged.exampleInput, toMathematicalSMP("applesandsourgrapes")); + // assert.deepEqual(merged.inputRange, srcTransforms.map((t) => ({ trueTransform: t, inputStartIndex: 0, bestProbFromSet: 1 }) )); + // assert.isTrue(merged.searchSpace.hasInputs( + // srcTransforms.map((t) => ([{sample: t, p: 1}])) + // )); + // }); + // }); + + // describe("split()", () => { + // it("handles clean two-way split correctly", () => { + // // Setup phase + // const keystrokeDistributions: Distribution[] = [ + // [ + // { sample: { insert: 'c', deleteLeft: 0 }, p: 0.75 }, + // { sample: { insert: 't', deleteLeft: 0 }, p: 0.25 } + // ], + // [ + // { sample: { insert: 'a', deleteLeft: 0 }, p: 0.75 }, + // { sample: { insert: 'o', deleteLeft: 0 }, p: 0.25 } + // ], + // [ + // { sample: { insert: 'n', deleteLeft: 0 }, p: 0.75 }, + // { sample: { insert: 'r', deleteLeft: 0 }, p: 0.25 } + // ], + // [ + // { sample: { insert: '\'', deleteLeft: 0 }, p: 0.75 }, + // { sample: { insert: 't', deleteLeft: 0 }, p: 0.25 } + // ] + // ] + + // const tokenToSplit = new ContextToken(plainModel); + // for(let i = 0; i < keystrokeDistributions.length; i++) { + // tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: .75}, keystrokeDistributions[i]); + // }; + + // assert.equal(tokenToSplit.sourceText, 'can\''); + // tokenToSplit.searchPath.hasInputs([...keystrokeDistributions]); + + // // And now for the "fun" part. + // const resultsOfSplit = tokenToSplit.split({ + // // Input portion here can be ignored. + // input: { + // text: 'can\'', + // index: 0 + // }, matches: [ + // // For this part, the text entries are what really matters. + // { text: 'can', index: 0, textOffset: 0 }, + // { text: '\'', index: 1, textOffset: 3 } + // ] + // }, plainModel); + + // assert.equal(resultsOfSplit.length, 2); + // assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), ['can', '\'']); + // assert.isTrue(resultsOfSplit[0].searchPath.hasInputs(keystrokeDistributions.slice(0, 3))); + // assert.isTrue(resultsOfSplit[1].searchPath.hasInputs([keystrokeDistributions[3]])); + // }); + + // it("handles mid-transform splits correctly", () => { + // // Setup phase + // const keystrokeDistributions: Distribution[] = [ + // [ + // { sample: { insert: 'biglargetransform', deleteLeft: 0, deleteRight: 0 }, p: 1 }, + // ] + // ]; + // const splitTextArray = ['big', 'large', 'transform']; + + // const tokenToSplit = new ContextToken(plainModel); + // for(let i = 0; i < keystrokeDistributions.length; i++) { + // tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: 1}, keystrokeDistributions[i]); + // }; + + // assert.equal(tokenToSplit.sourceText, 'biglargetransform'); + // assert.isTrue(tokenToSplit.searchPath.hasInputs([...keystrokeDistributions])); + + // // And now for the "fun" part. + // const resultsOfSplit = tokenToSplit.split({ + // // Input portion here can be ignored. + // input: { + // text: 'biglargetransform', + // index: 0 + // }, matches: [ + // // For this part, the text entries are what really matters. + // { text: 'big', index: 0, textOffset: 0 }, + // { text: 'large', index: 1, textOffset: 3 }, + // { text: 'transform', index: 2, textOffset: 8 } + // ] + // }, plainModel); + + // assert.equal(resultsOfSplit.length, 3); + // assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); + // assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputRange[0]), [0, 3, 8].map(i => ({ + // trueTransform: { + // insert: 'biglargetransform', + // deleteLeft: 0, + // deleteRight: 0 + // }, + // inputStartIndex: i, + // bestProbFromSet: 1 + // }))); + + // for(let i = 0; i < resultsOfSplit.length; i++) { + // assert.isTrue(resultsOfSplit[i].searchPath.hasInputs([ + // [{sample: { insert: splitTextArray[i], deleteLeft: 0, deleteRight: 0 }, p: 1}] + // ])); + // } + // }); + + // it("handles messy mid-transform splits correctly", () => { + // // Setup phase + // const keystrokeDistributions: Distribution[] = [ + // [ + // { sample: { insert: 'long', deleteLeft: 0, deleteRight: 0, id: 11 }, p: 1 } + // ], [ + // { sample: { insert: 'argelovely', deleteLeft: 3, deleteRight: 0, id: 12 }, p: 1 } + // ], [ + // { sample: { insert: 'ngtransforms', deleteLeft: 4, deleteRight: 0, id: 13 }, p: 1 } + // ] + // ]; + // const splitTextArray = ['large', 'long', 'transforms']; + + // const tokenToSplit = new ContextToken(plainModel); + // for(let i = 0; i < keystrokeDistributions.length; i++) { + // tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: 1}, keystrokeDistributions[i]); + // }; + + // assert.equal(tokenToSplit.exampleInput, 'largelongtransforms'); + // tokenToSplit.searchPath.hasInputs([...keystrokeDistributions]); + + // // And now for the "fun" part. + // const resultsOfSplit = tokenToSplit.split({ + // // Input portion here can be ignored. + // input: { + // text: 'largelongtransforms', + // index: 0 + // }, matches: [ + // // For this part, the text entries are what really matters. + // { text: 'large', index: 0, textOffset: 0 }, + // { text: 'long', index: 1, textOffset: 5 }, + // { text: 'transforms', index: 2, textOffset: 9 } + // ] + // }, plainModel); + + // assert.equal(resultsOfSplit.length, 3); + // assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); + // assert.deepEqual(resultsOfSplit[0].inputRange, [ + // { trueTransform: keystrokeDistributions[0][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, + // { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, + // ]); + // assert.deepEqual(resultsOfSplit[1].inputRange, [ + // { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 'arge'.length, bestProbFromSet: 1 }, + // { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, + // ]); + // assert.deepEqual(resultsOfSplit[2].inputRange, [ + // { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 'ng'.length, bestProbFromSet: 1 } + // ]); + + // assert.isTrue(resultsOfSplit[0].searchPath.hasInputs([ + // keystrokeDistributions[0], + // keystrokeDistributions[1].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: entry.sample.insert.slice(0, 4) // gets the 'arge' portion & the deleteLefts. + // }, p: entry.p + // } + // }), + // ])); + + // assert.isTrue(resultsOfSplit[1].searchPath.hasInputs([ + // keystrokeDistributions[1].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: entry.sample.insert.slice('arge'.length), + // deleteLeft: 0 + // }, p: entry.p + // } + // }), + // keystrokeDistributions[2].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: entry.sample.insert.slice(0, 'ng'.length), // gets the 'ng' portion. + // }, p: entry.p + // } + // }), + // ])); + + // assert.isTrue(resultsOfSplit[2].searchPath.hasInputs([ + // keystrokeDistributions[2].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: entry.sample.insert.slice('ng'.length), // drops the 'ng' portion. + // deleteLeft: 0 + // }, p: entry.p + // } + // }), + // ])); + // }); + + // it("handles messy mid-transform splits correctly - non-BMP text", () => { + // // Setup phase + // const keystrokeDistributions: Distribution[] = [ + // [ + // { sample: { insert: toMathematicalSMP('long'), deleteLeft: 0, deleteRight: 0, id: 11 }, p: 1 } + // ], [ + // { sample: { insert: toMathematicalSMP('argelovely'), deleteLeft: 3, deleteRight: 0, id: 12 }, p: 1 } + // ], [ + // { sample: { insert: toMathematicalSMP('ngtransforms'), deleteLeft: 4, deleteRight: 0, id: 13 }, p: 1 } + // ] + // ]; + // const splitTextArray = ['large', 'long', 'transforms'].map(t => toMathematicalSMP(t)); + + // const tokenToSplit = new ContextToken(plainModel); + // for(let i = 0; i < keystrokeDistributions.length; i++) { + // tokenToSplit.addInput({trueTransform: keystrokeDistributions[i][0].sample, inputStartIndex: 0, bestProbFromSet: 1}, keystrokeDistributions[i]); + // }; + + // assert.equal(tokenToSplit.exampleInput, toMathematicalSMP('largelongtransforms')); + // tokenToSplit.searchPath.hasInputs([...keystrokeDistributions]); + + // // And now for the "fun" part. + // const resultsOfSplit = tokenToSplit.split({ + // // Input portion here can be ignored. + // input: { + // text: toMathematicalSMP('largelongtransforms'), + // index: 0 + // }, matches: [ + // // For this part, the text entries are what really matters. + // { text: toMathematicalSMP('large'), index: 0, textOffset: 0 }, + // { text: toMathematicalSMP('long'), index: 1, textOffset: 5 }, + // { text: toMathematicalSMP('transforms'), index: 2, textOffset: 9 } + // ] + // }, plainModel); + + // assert.equal(resultsOfSplit.length, 3); + // assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); + // assert.deepEqual(resultsOfSplit[0].inputRange, [ + // { trueTransform: keystrokeDistributions[0][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, + // { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, + // ]); + // assert.deepEqual(resultsOfSplit[1].inputRange, [ + // { trueTransform: keystrokeDistributions[1][0].sample, inputStartIndex: 'arge'.length, bestProbFromSet: 1 }, + // { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 0, bestProbFromSet: 1 }, + // ]); + // assert.deepEqual(resultsOfSplit[2].inputRange, [ + // { trueTransform: keystrokeDistributions[2][0].sample, inputStartIndex: 'ng'.length, bestProbFromSet: 1 } + // ]); + + // assert.isTrue(resultsOfSplit[0].searchPath.hasInputs([ + // keystrokeDistributions[0], + // keystrokeDistributions[1].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: KMWString.substring(entry.sample.insert, 0, 4) // gets the 'arge' portion & the deleteLefts. + // }, p: entry.p + // } + // }), + // ])); + + // assert.isTrue(resultsOfSplit[1].searchPath.hasInputs([ + // keystrokeDistributions[1].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: KMWString.substring(entry.sample.insert, 'arge'.length), + // deleteLeft: 0 + // }, p: entry.p + // } + // }), + // keystrokeDistributions[2].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: KMWString.substring(entry.sample.insert, 0, 'ng'.length), // gets the 'ng' portion. + // }, p: entry.p + // } + // }), + // ])); + + // assert.isTrue(resultsOfSplit[2].searchPath.hasInputs([ + // keystrokeDistributions[2].map((entry) => { + // return { + // sample: { + // ...entry.sample, + // insert: KMWString.substring(entry.sample.insert, 'ng'.length), // drops the 'ng' portion. + // deleteLeft: 0 + // }, p: entry.p + // } + // }), + // ])); + // }); + // }); }); describe('preprocessInputSources', () => { diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index a5a487e83d1..c18d16509be 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -15,7 +15,7 @@ import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs' import { LexicalModelTypes } from '@keymanapp/common-types'; import { KMWString } from '@keymanapp/web-utils'; -import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, models, PendingTokenization, traceInsertEdits } from '@keymanapp/lm-worker/test-index'; +import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, models, PendingTokenization, SearchPath, traceInsertEdits } from '@keymanapp/lm-worker/test-index'; import Transform = LexicalModelTypes.Transform; import TrieModel = models.TrieModel; @@ -139,16 +139,18 @@ describe('ContextTokenization', function() { let baseTokenization = new ContextTokenization(tokens, transitionEdits, null /* dummy val */); let cloned = new ContextTokenization(baseTokenization); - assert.deepEqual(cloned.tokens.map((token) => token.searchPath.inputSequence), - baseTokenization.tokens.map((token) => token.searchPath.inputSequence)); + assert.sameOrderedMembers( + cloned.tokens.map((token) => token.searchSpace), + baseTokenization.tokens.map((token) => token.searchSpace) + ); // The `.searchSpace` instances will not be deep-equal; there are class properties // that hold functions with closures, configured at runtime. // @ts-ignore - TS2704 b/c deleting a readonly property. - baseTokenization.tokens.forEach((token) => delete token.searchPath); + baseTokenization.tokens.forEach((token) => delete token.searchSpace); // @ts-ignore - TS2704 b/c deleting a readonly property. - cloned.tokens.forEach((token) => delete token.searchPath); + cloned.tokens.forEach((token) => delete token.searchSpace); assert.deepEqual(cloned, baseTokenization); }); @@ -198,13 +200,19 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), targetTokens ); - assert.includeDeepMembers( - tokenization.tokens[tokenization.tokens.length - 2].searchPath.inputSequence, - [[{sample: inputTransformMap.get(1), p: 1}]] + assert.equal( + tokenization.tokens[tokenization.tokens.length - 2].searchSpace.inputCount, 1 + ); + assert.deepEqual( + (tokenization.tokens[tokenization.tokens.length - 2].searchSpace.parents[0] as SearchPath).lastInput, + [{sample: inputTransformMap.get(1), p: 1}] ); - assert.includeDeepMembers( - tokenization.tail.searchPath.inputSequence, - [[{sample: inputTransformMap.get(2), p: 1}]] + assert.equal( + tokenization.tail.searchSpace.inputCount, 1 + ); + assert.deepEqual( + (tokenization.tail.searchSpace.parents[0] as SearchPath).lastInput, + [{sample: inputTransformMap.get(2), p: 1}] ); }); @@ -281,9 +289,15 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), targetTokens ); - assert.includeDeepMembers( - tokenization.tail.searchPath.inputSequence, - [[{sample: inputTransformMap.get(0), p: 1}]] + assert.equal(baseTokenization.tail.searchSpace.inputCount, 2); + // TODO: .hasParent() (so we don't have to worry about the specific placement) + assert.deepEqual(tokenization.tail.searchSpace.parents[0].parents, [baseTokenization.tail.searchSpace]); + assert.equal( + tokenization.tail.searchSpace.inputCount, 3 + ); + assert.deepEqual( + (tokenization.tail.searchSpace.parents[0] as SearchPath).lastInput, + [{sample: inputTransformMap.get(0), p: 1}] ); }); @@ -322,11 +336,22 @@ describe('ContextTokenization', function() { assert.deepEqual(tokenization.tokens.map((t) => ({text: t.exampleInput, isWhitespace: t.isWhitespace})), targetTokens ); - assert.includeDeepMembers( - tokenization.tail.searchPath.inputSequence, + + // As we fully deleted the old token, the new one "starts" after the deleteLeft. + // The deleteLeft component should not be included here. Mocking may be needed! + assert.equal( + tokenization.tail.searchSpace.inputCount, 1 // is a single transform. + ); + assert.equal( + tokenization.tokens[tokenization.tokens.length - 2].searchSpace, + baseTokenization.tokens[tokenization.tokens.length - 2].searchSpace + ) + assert.notEqual(tokenization.tail.searchSpace.parents, [baseTokenization.tail.searchSpace]); + assert.deepEqual( + (tokenization.tail.searchSpace.parents[0] as SearchPath).lastInput, // As we fully deleted the old token, the new one "starts" after the deleteLeft. // The deleteLeft component should not be included here. - [[{sample: { insert: 'week', deleteLeft: 0 /* NOT 3 */ }, p: 1}]] + [{sample: { insert: 'week', deleteLeft: 0 /* NOT 3 */ }, p: 1}] ); }); @@ -376,9 +401,8 @@ describe('ContextTokenization', function() { transform.deleteLeft = 0; } - assert.includeDeepMembers( - tokenization.tokens[tailIndex + i].searchPath.inputSequence, - [[{sample: transform, p: 1}]] + assert.deepEqual((tokenization.tokens[tailIndex + i].searchSpace.parents[0] as SearchPath).lastInput, + [{sample: transform, p: 1}] ); } }); @@ -438,9 +462,8 @@ describe('ContextTokenization', function() { transform.deleteLeft = 0; } - assert.includeDeepMembers( - tokenization.tokens[tailIndex + i].searchPath.inputSequence, - [[{sample: transform, p: 1}]] + assert.deepEqual((tokenization.tokens[tailIndex + i].searchSpace.parents[0] as SearchPath).lastInput, + [{sample: transform, p: 1}] ); } }); @@ -492,21 +515,20 @@ describe('ContextTokenization', function() { transform.deleteLeft = 0; } - assert.includeDeepMembers( - tokenization.tokens[tailIndex + i].searchPath.inputSequence, - [[{sample: transform, p: 1}]] + assert.deepEqual((tokenization.tokens[tailIndex + i].searchSpace.parents[0] as SearchPath).lastInput, + [{sample: transform, p: 1}] ); } }); - it('handles case that triggers a token merge: can+\'+t', () => { + it.skip('handles case that triggers a token merge: can+\'+t', () => { const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can', '\'']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); const targetTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'t'].map((t) => ({text: t, isWhitespace: t == ' '})); const inputTransform = { insert: 't', deleteLeft: 0, deleteRight: 0 }; const inputTransformMap: Map = new Map(); - inputTransformMap.set(0, { insert: 't', deleteLeft: 0 }); + inputTransformMap.set(0, inputTransform); const edgeWindow = buildEdgeWindow(baseTokenization.tokens, inputTransform, false, testEdgeWindowSpec); const tokenization = baseTokenization.evaluateTransition({ @@ -548,26 +570,21 @@ describe('ContextTokenization', function() { targetTokens ); - assert.includeDeepMembers( - [...tokenization.tail.inputRange], - [...baseTokenization.tokens[baseTokenization.tokens.length - 2].inputRange] - ); - assert.includeDeepMembers( - tokenization.tail.searchPath.inputSequence, - baseTokenization.tokens[baseTokenization.tokens.length - 2].searchPath.inputSequence - ); - - assert.includeDeepMembers( - [...tokenization.tail.inputRange], - [...baseTokenization.tokens[baseTokenization.tokens.length - 1].inputRange] - ); - assert.includeDeepMembers( - tokenization.tail.searchPath.inputSequence, - baseTokenization.tokens[baseTokenization.tokens.length - 1].searchPath.inputSequence + const basePreTail = baseTokenization.tokens[baseTokenization.tokens.length - 2]; + const baseTail = baseTokenization.tail; + assert.equal( + tokenization.tail.searchSpace.inputCount, + basePreTail.searchSpace.inputCount + baseTail.searchSpace.inputCount + 1 /* +1 - incoming transform */ ); + assert.deepEqual((tokenization.tail.searchSpace as SearchPath).lastInput, [{ sample: inputTransform, p: 1 }]); + assert.equal(tokenization.tail.exampleInput, 'can\'t'); + assert.deepEqual(tokenization.tail.searchSpace.bestExample, { + text: basePreTail.searchSpace.bestExample.text + baseTail.searchSpace.bestExample.text + inputTransform.insert, + p: basePreTail.searchSpace.bestExample.p * baseTail.searchSpace.bestExample.p * 1 /* prob of input transform */ + }); }); - it('handles case that triggers a token split: can\' +. => can, \', .', () => { + it.skip('handles case that triggers a token split: can\' +. => can, \', .', () => { const baseTokens = ['an', ' ', 'apple', ' ', 'a', ' ', 'day', ' ', 'can\'']; const baseTokenization = new ContextTokenization(baseTokens.map(t => toToken(t))); @@ -619,25 +636,24 @@ describe('ContextTokenization', function() { targetTokens ); - assert.includeDeepMembers( - [...baseTokenization.tail.inputRange], - [...tokenization.tokens[tokenization.tokens.length - 2].inputRange] - ); - assert.includeDeepMembers( - baseTokenization.tail.searchPath.inputSequence, - tokenization.tokens[tokenization.tokens.length - 2].searchPath.inputSequence - ); - - // We've also appended a '.' to the final split-off token. Thus, we need - // to account for that in the assertions below. - assert.includeDeepMembers( - [...baseTokenization.tail.inputRange, { trueTransform: inputTransform, inputStartIndex: 0, bestProbFromSet: 1 }], - [...tokenization.tokens[tokenization.tokens.length - 1].inputRange] - ); - assert.includeDeepMembers( - [...baseTokenization.tail.searchPath.inputSequence, [{sample: { insert: '.', deleteLeft: 0 }, p: 1}]], - tokenization.tokens[tokenization.tokens.length - 1].searchPath.inputSequence + const prepreTail = tokenization.tokens[tokenization.tokens.length - 3]; + const preTail = tokenization.tokens[tokenization.tokens.length - 2]; + const tail = tokenization.tail; + assert.equal( + baseTokenization.tail.searchSpace.inputCount, + prepreTail.searchSpace.inputCount + preTail.searchSpace.inputCount ); + assert.equal(tail.searchSpace.inputCount, 1); + // base tokenization did not include the '.' component. + assert.deepEqual((preTail.searchSpace as SearchPath).lastInput, (baseTokenization.tail.searchSpace as SearchPath).lastInput); + assert.deepEqual((tail.searchSpace as SearchPath).lastInput, [{sample: inputTransformMap.get(1), p: 1}]); + assert.equal(prepreTail.exampleInput, 'can'); + assert.equal(preTail.exampleInput, '\''); + assert.equal(tail.exampleInput, '.'); + assert.deepEqual({ + text: prepreTail.searchSpace.bestExample.text + preTail.searchSpace.bestExample.text, + p: prepreTail.searchSpace.bestExample.p * preTail.searchSpace.bestExample.p + }, baseTokenization.tail.searchSpace.bestExample); }); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-transition.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-transition.tests.ts index a8fc06ab9dc..6185069c7b3 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-transition.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-transition.tests.ts @@ -56,7 +56,6 @@ describe('ContextTransition', () => { assert.equal(transition.transitionId, 1); assert.isNotOk(transition.final); assert.isNotOk(transition.inputDistribution); - assert.isNotOk(transition.preservationTransform); }); it('deep-copies when given a previous ContextState instance (no `final`)', () => { diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts index c72917bed73..abf9402f701 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts @@ -33,6 +33,8 @@ const toKey = (s: string) => testModel.toKey(s); */ const FIRST_CHAR_VARIANTS = 24; +let SEARCH_EDGE_SEED = 0; + function assertEdgeChars(edge: correction.SearchNode, input: string, match: string) { assert.isTrue(edgeHasChars(edge, input, match)); } @@ -46,7 +48,7 @@ function edgeHasChars(edge: correction.SearchNode, input: string, match: string) } function findEdgesWithChars(edgeArray: correction.SearchNode[], match: string) { - let results = edgeArray.filter(function(value) { + const results = edgeArray.filter((value) => { return value.calculation.lastMatchEntry == match; }); @@ -55,7 +57,8 @@ function findEdgesWithChars(edgeArray: correction.SearchNode[], match: string) { } function fetchCommonTENode() { - const rootNode = new SearchNode(testModel.traverseFromRoot(), toKey); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new SearchNode(testModel.traverseFromRoot(), rootSeed, toKey); // Establish desired source node: prefix 'te'. const firstLayerTransforms: Distribution = [{ @@ -71,14 +74,16 @@ function fetchCommonTENode() { }, p: 0.25 }]; + const firstLayerSeed = SEARCH_EDGE_SEED++; const firstLayerNodes = rootNode - .buildSubstitutionEdges(firstLayerTransforms) + .buildSubstitutionEdges(firstLayerTransforms, firstLayerSeed) .flatMap(node => node.processSubsetEdge()); assert.isAbove(firstLayerNodes.length, FIRST_CHAR_VARIANTS); firstLayerNodes.sort((a, b) => a.currentCost - b.currentCost); const tNode = firstLayerNodes[0]; assert.equal(tNode.resultKey, 't'); + assert.equal(tNode.spaceId, firstLayerSeed); assert.sameDeepMembers(tNode.priorInput, [firstLayerTransforms[0]]); assert.isFalse(tNode.hasPartialInput); @@ -95,8 +100,9 @@ function fetchCommonTENode() { }, p: 0.2 }]; + const secondLayerSeed = SEARCH_EDGE_SEED++; const secondLayerNodes = tNode - .buildSubstitutionEdges(secondLayerTransforms) + .buildSubstitutionEdges(secondLayerTransforms, secondLayerSeed) .flatMap(node => node.processSubsetEdge()); // The field narrows down at this point, but still has a decent number // of variants (11). @@ -107,6 +113,7 @@ function fetchCommonTENode() { assert.equal(teNode.resultKey, 'te'); assert.equal(teNode.editCount, 0); + assert.equal(teNode.spaceId, secondLayerSeed); return teNode; } @@ -114,7 +121,8 @@ function fetchCommonTENode() { describe('Correction Distance Modeler', () => { describe('SearchNode', () => { it('constructs a fresh instance from a traversal + keyingFunction', () => { - const rootNode = new SearchNode(testModel.traverseFromRoot(), toKey); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new SearchNode(testModel.traverseFromRoot(), rootSeed, toKey); assert.equal(rootNode.resultKey, ''); assert.equal(rootNode.editCount, 0); @@ -127,11 +135,12 @@ describe('Correction Distance Modeler', () => { assert.isUndefined(rootNode.calculation.lastInputEntry); assert.isUndefined(rootNode.calculation.lastMatchEntry); assert.equal(rootNode.toKey, toKey); + assert.equal(rootNode.spaceId, rootSeed); }); describe('supports the cloning constructor pattern', () => { it('properly deep-copies the root node', () => { - const originalNode = new SearchNode(testModel.traverseFromRoot(), toKey); + const originalNode = new SearchNode(testModel.traverseFromRoot(), SEARCH_EDGE_SEED++, toKey); const clonedNode = new SearchNode(originalNode); // Root node properties; may as well re-assert 'em. @@ -147,6 +156,7 @@ describe('Correction Distance Modeler', () => { assert.isUndefined(clonedNode.calculation.lastInputEntry); assert.isUndefined(clonedNode.calculation.lastMatchEntry); assert.equal(clonedNode.toKey, toKey); + assert.equal(clonedNode.spaceId, originalNode.spaceId); // Avoid aliasing for properties holding mutable objects assert.notEqual(clonedNode.priorInput, originalNode.priorInput); @@ -157,7 +167,8 @@ describe('Correction Distance Modeler', () => { }); it('properly deep-copies fully-processed nodes later in the search path', () => { - const rootNode = new SearchNode(testModel.traverseFromRoot(), toKey); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new SearchNode(testModel.traverseFromRoot(), rootSeed, toKey); // Establish desired source node: prefix 'te'. const firstLayerTransforms: Distribution = [{ @@ -174,7 +185,7 @@ describe('Correction Distance Modeler', () => { p: 0.25 }]; const firstLayerNodes = rootNode - .buildSubstitutionEdges(firstLayerTransforms) + .buildSubstitutionEdges(firstLayerTransforms, SEARCH_EDGE_SEED++) .flatMap(node => node.processSubsetEdge()); assert.isAbove(firstLayerNodes.length, FIRST_CHAR_VARIANTS); firstLayerNodes.sort((a, b) => a.currentCost - b.currentCost); @@ -197,8 +208,9 @@ describe('Correction Distance Modeler', () => { }, p: 0.2 }]; + const secondSpaceId = SEARCH_EDGE_SEED++; const secondLayerNodes = tNode - .buildSubstitutionEdges(secondLayerTransforms) + .buildSubstitutionEdges(secondLayerTransforms, secondSpaceId) .flatMap(node => node.processSubsetEdge()); assert.isAbove(secondLayerNodes.length, 10); secondLayerNodes.sort((a, b) => a.currentCost - b.currentCost); @@ -220,6 +232,7 @@ describe('Correction Distance Modeler', () => { assert.equal(node.calculation.lastMatchEntry, 'e'); assert.equal((node.currentTraversal as TrieTraversal).prefix, 'te'); assert.equal(node.toKey, toKey); + assert.equal(node.spaceId, secondSpaceId); } assertSourceNodeProps(teNode); @@ -238,7 +251,8 @@ describe('Correction Distance Modeler', () => { }); it('properly deep-copies partially-processed edges later in the search path', () => { - const rootNode = new SearchNode(testModel.traverseFromRoot(), toKey); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new SearchNode(testModel.traverseFromRoot(), rootSeed, toKey); // Establish desired source node: prefix 'te', with 'e' the first letter of // a multi-char insert Transform. @@ -256,7 +270,7 @@ describe('Correction Distance Modeler', () => { p: 0.25 }]; const firstLayerNodes = rootNode - .buildSubstitutionEdges(firstLayerTransforms) + .buildSubstitutionEdges(firstLayerTransforms, SEARCH_EDGE_SEED++) .flatMap(node => node.processSubsetEdge()); assert.isAbove(firstLayerNodes.length, FIRST_CHAR_VARIANTS); firstLayerNodes.sort((a, b) => a.currentCost - b.currentCost); @@ -279,8 +293,10 @@ describe('Correction Distance Modeler', () => { }, p: 0.2 }]; + + const secondLayerId = SEARCH_EDGE_SEED++; const secondLayerNodes = tNode - .buildSubstitutionEdges(secondLayerTransforms) + .buildSubstitutionEdges(secondLayerTransforms, secondLayerId) .flatMap(node => node.processSubsetEdge()); assert.isAbove(secondLayerNodes.length, 10); secondLayerNodes.sort((a, b) => a.currentCost - b.currentCost); @@ -302,6 +318,7 @@ describe('Correction Distance Modeler', () => { assert.equal(node.calculation.lastMatchEntry, 'e'); assert.equal((node.currentTraversal as TrieTraversal).prefix, 'te'); assert.equal(node.toKey, toKey); + assert.equal(node.spaceId, secondLayerId); } assertSourceNodeProps(teNode); @@ -321,25 +338,27 @@ describe('Correction Distance Modeler', () => { }); // Consider adding more, deeper? - it('builds insertion edges based on lexicon, from root', function() { - let rootTraversal = testModel.traverseFromRoot(); + it('builds insertion edges based on lexicon, from root', () => { + const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - let rootNode = new correction.SearchNode(rootTraversal); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new correction.SearchNode(rootTraversal, rootSeed); assert.equal(rootNode.calculation.getHeuristicFinalCost(), 0); - let edges = rootNode.buildInsertionEdges(); + const edges = rootNode.buildInsertionEdges(); assert.isAbove(edges.length, 0); let expectedChildCount = 0; - for(let child of rootTraversal.children()) { + for(const child of rootTraversal.children()) { expectedChildCount++; - let childEdge = edges.filter(value => value.calculation.lastMatchEntry == child.char)[0]; + const childEdge = edges.filter(value => value.calculation.lastMatchEntry == child.char)[0]; assert.isOk(childEdge); assert.isEmpty(childEdge.priorInput); assert.isEmpty(childEdge.calculation.inputSequence); assert.isAbove(childEdge.currentCost, 0); + assert.equal(childEdge.spaceId, rootSeed); } assert.equal(edges.length, expectedChildCount); @@ -363,13 +382,15 @@ describe('Correction Distance Modeler', () => { ]; it('step 1: batches deletion edge(s) for input transforms', () => { - let rootTraversal = testModel.traverseFromRoot(); + const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - let rootNode = new correction.SearchNode(rootTraversal); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new correction.SearchNode(rootTraversal, rootSeed); assert.equal(rootNode.calculation.getHeuristicFinalCost(), 0); - const subsetNodes = rootNode.buildDeletionEdges(synthDistribution); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); assert.equal(subsetNodes.length, 4); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); const expectedCosts = [0.5, .25, 0.15, 0.1].map(x => -Math.log(x)) @@ -378,6 +399,7 @@ describe('Correction Distance Modeler', () => { assert.isTrue(subsetNodes[i].hasPartialInput); // From root: the deleteLeft 1 entries have nothing to delete. assert.equal((subsetNodes[i].currentTraversal as TrieTraversal).prefix, ''); + assert.equal(subsetNodes[i].spaceId, subsetSeed); // Allow a little value wiggle due to double-precision limitations. assert.approximately(subsetNodes[i].inputSamplingCost, expectedCosts[i], 1e-8); @@ -388,15 +410,18 @@ describe('Correction Distance Modeler', () => { it('step 2: first processing layer resolves zero + one char inserts', () => { // From "step 1" above, assertions removed - let rootTraversal = testModel.traverseFromRoot(); - let rootNode = new correction.SearchNode(rootTraversal); - const subsetNodes = rootNode.buildDeletionEdges(synthDistribution); + const rootTraversal = testModel.traverseFromRoot(); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new correction.SearchNode(rootTraversal, rootSeed); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); const processedNodes = subsetNodes.flatMap(n => n.processSubsetEdge()); // All delete-oriented Transform subsets condense down to a single // transform (and edge) each. assert.equal(processedNodes.length, 4); + processedNodes.forEach((n) => assert.equal(n.spaceId, subsetSeed)); // Sorted index 0: 1 insert - should be processed already, in a single // step. @@ -431,9 +456,11 @@ describe('Correction Distance Modeler', () => { it('step 3: second processing layer resolves two char inserts', () => { // From "steps 0, 1" above, assertions removed - let rootTraversal = testModel.traverseFromRoot(); - let rootNode = new correction.SearchNode(rootTraversal); - const subsetNodes = rootNode.buildDeletionEdges(synthDistribution); + const rootTraversal = testModel.traverseFromRoot(); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new correction.SearchNode(rootTraversal, rootSeed); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildDeletionEdges(synthDistribution, subsetSeed); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); // Two nodes were unprocessed at the end of the last step; we handle @@ -450,6 +477,7 @@ describe('Correction Distance Modeler', () => { assert.isFalse(processedNode.hasPartialInput); assert.isFalse(processedNode.hasPartialInput); assert.equal(processedNode.editCount, 2); + assert.equal(processedNode.spaceId, subsetSeed); assert.equal(processedNode.calculation.inputSequence.length, 2); assert.equal(processedNode.calculation.lastInputEntry, SENTINEL_CODE_UNIT); assert.equal(processedNode.inputSamplingCost, step2Nodes[index].inputSamplingCost); @@ -481,7 +509,8 @@ describe('Correction Distance Modeler', () => { const teNode = fetchCommonTENode(); assert.equal(teNode.calculation.getHeuristicFinalCost(), 0); - const subsetNodes = teNode.buildDeletionEdges(synthDistribution); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); assert.equal(subsetNodes.length, 4); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); @@ -492,6 +521,8 @@ describe('Correction Distance Modeler', () => { assert.isTrue(subsetNodes[i].hasPartialInput); // From a 'te' prefix, the deleteLeft 1 entries do have something to delete. assert.equal((subsetNodes[i].currentTraversal as TrieTraversal).prefix, (i == 0 || i == 2) ? 'te' : 't'); + assert.notEqual(subsetNodes[i].spaceId, teNode.spaceId); + assert.equal(subsetNodes[i].spaceId, subsetSeed); // Allow a little value wiggle due to double-precision limitations. assert.approximately(subsetNodes[i].inputSamplingCost, expectedCosts[i], 1e-8); @@ -503,13 +534,16 @@ describe('Correction Distance Modeler', () => { it('step 2: first processing layer resolves zero + one char inserts', () => { // From "step 1" above, assertions removed const teNode = fetchCommonTENode(); - const subsetNodes = teNode.buildDeletionEdges(synthDistribution); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); const processedNodes = subsetNodes.flatMap(n => n.processSubsetEdge()); // All delete-oriented Transform subsets condense down to a single // transform (and edge) each. assert.equal(processedNodes.length, 4); + processedNodes.forEach(n => assert.notEqual(n.spaceId, teNode.spaceId)); + processedNodes.forEach(n => assert.equal(n.spaceId, subsetSeed)); // Sorted index 0: 1 insert - should be processed already, in a single // step. @@ -545,7 +579,8 @@ describe('Correction Distance Modeler', () => { it('step 3: second processing layer resolves two char inserts', () => { const teNode = fetchCommonTENode(); - const subsetNodes = teNode.buildDeletionEdges(synthDistribution); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = teNode.buildDeletionEdges(synthDistribution, subsetSeed); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); // Two nodes were unprocessed at the end of the last step; we handle @@ -562,6 +597,8 @@ describe('Correction Distance Modeler', () => { assert.isFalse(processedNode.hasPartialInput); assert.isFalse(processedNode.hasPartialInput); assert.equal(processedNode.editCount, 2); + assert.notEqual(processedNode.spaceId, teNode.spaceId); + assert.equal(processedNode.spaceId, subsetSeed); assert.equal(processedNode.calculation.inputSequence.length, 4 - dl); assert.equal(processedNode.calculation.lastInputEntry, SENTINEL_CODE_UNIT); assert.equal(processedNode.inputSamplingCost, baseNode.inputSamplingCost); @@ -592,13 +629,14 @@ describe('Correction Distance Modeler', () => { ]; it('step 1: batches substitution edge(s) for input transforms', () => { - let rootTraversal = testModel.traverseFromRoot(); + const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - let rootNode = new correction.SearchNode(rootTraversal); + const rootNode = new correction.SearchNode(rootTraversal, SEARCH_EDGE_SEED++); assert.equal(rootNode.calculation.getHeuristicFinalCost(), 0); - const subsetNodes = rootNode.buildSubstitutionEdges(synthDistribution); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildSubstitutionEdges(synthDistribution, subsetSeed); assert.equal(subsetNodes.length, 4); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); const expectedCosts = [0.5, .25, 0.15, 0.1].map(x => -Math.log(x)) @@ -607,6 +645,7 @@ describe('Correction Distance Modeler', () => { assert.isTrue(subsetNodes[i].hasPartialInput); // From root: the deleteLeft 1 entries have nothing to insert. assert.equal((subsetNodes[i].currentTraversal as TrieTraversal).prefix, ''); + assert.equal(subsetNodes[i].spaceId, subsetSeed); // Allow a little value wiggle due to double-precision limitations. assert.approximately(subsetNodes[i].inputSamplingCost, expectedCosts[i], 1e-8); @@ -617,15 +656,18 @@ describe('Correction Distance Modeler', () => { it('step 2: first processing layer resolves zero + one char inserts', () => { // From "step 1" above, assertions removed - let rootTraversal = testModel.traverseFromRoot(); - let rootNode = new correction.SearchNode(rootTraversal); - const subsetNodes = rootNode.buildSubstitutionEdges(synthDistribution); + const rootTraversal = testModel.traverseFromRoot(); + const rootNode = new correction.SearchNode(rootTraversal, SEARCH_EDGE_SEED++); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildSubstitutionEdges(synthDistribution, subsetSeed); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); + subsetNodes.forEach(n => assert.equal(n.spaceId, subsetSeed)); // Set 0: set for ins 1, dl 0 const ins1_dl0 = subsetNodes[0].processSubsetEdge(); // 3 transforms went in... but 1 ('x') had no lexical match. assert.equal(ins1_dl0.length, FIRST_CHAR_VARIANTS + 3 - 1); + ins1_dl0.forEach(n => assert.equal(n.spaceId, subsetSeed)); ins1_dl0.sort((a, b) => a.currentCost - b.currentCost); ins1_dl0.forEach(n => assert.isFalse(n.hasPartialInput)); // all fully-processed. @@ -673,6 +715,7 @@ describe('Correction Distance Modeler', () => { // No inserts, so no insert variants are possible. assert.equal(ins0_dl1.length, 1); + ins0_dl1.forEach(n => assert.equal(n.spaceId, subsetSeed)); assert.isFalse(ins0_dl1[0].hasPartialInput); // No insert string => no sentinel char to delete. @@ -691,6 +734,7 @@ describe('Correction Distance Modeler', () => { ins2_dl1.sort((a, b) => a.currentCost - b.currentCost); // only one char is processed at this stage. ins2_dl1.forEach(n => assert.isTrue(n.hasPartialInput)); + ins2_dl1.forEach(n => assert.equal(n.spaceId, subsetSeed)); assert.equal(ins2_dl1[0].calculation.lastInputEntry, 't'); assert.equal(ins2_dl1[0].calculation.lastMatchEntry, 't'); @@ -722,6 +766,7 @@ describe('Correction Distance Modeler', () => { ins2_dl0.sort((a, b) => a.currentCost - b.currentCost); // only one char is processed at this stage. ins2_dl0.forEach(n => assert.isTrue(n.hasPartialInput)); + ins2_dl0.forEach(n => assert.equal(n.spaceId, subsetSeed)); assert.equal(ins2_dl0[0].calculation.lastInputEntry, 'c'); assert.equal(ins2_dl0[0].calculation.lastMatchEntry, 'c'); @@ -745,9 +790,10 @@ describe('Correction Distance Modeler', () => { it('step 3: second processing layer resolves two char inserts', () => { // From "steps 0, 1" above, assertions removed - let rootTraversal = testModel.traverseFromRoot(); - let rootNode = new correction.SearchNode(rootTraversal); - const subsetNodes = rootNode.buildSubstitutionEdges(synthDistribution); + const rootTraversal = testModel.traverseFromRoot(); + const rootNode = new correction.SearchNode(rootTraversal, SEARCH_EDGE_SEED++); + const subsetSeed = SEARCH_EDGE_SEED++; + const subsetNodes = rootNode.buildSubstitutionEdges(synthDistribution, subsetSeed); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); @@ -762,6 +808,9 @@ describe('Correction Distance Modeler', () => { // All should be finished now! fin_in2_dl1.forEach(n => assert.isFalse(n.hasPartialInput)); + fin_in2_dl1.forEach(n => assert.equal(n.spaceId, subsetSeed)); + fin_in2_dl0.forEach(n => assert.equal(n.spaceId, subsetSeed)); + fin_in2_dl1.sort((a, b) => a.currentCost - b.currentCost); fin_in2_dl0.sort((a, b) => a.currentCost - b.currentCost); @@ -837,7 +886,7 @@ describe('Correction Distance Modeler', () => { const teNode = fetchCommonTENode(); assert.equal(teNode.calculation.getHeuristicFinalCost(), 0); - const subsetNodes = teNode.buildSubstitutionEdges(synthDistribution); + const subsetNodes = teNode.buildSubstitutionEdges(synthDistribution, SEARCH_EDGE_SEED++); assert.equal(subsetNodes.length, 4); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); const expectedCosts = [0.5, .25, 0.15, 0.1].map(x => -Math.log(x) + teNode.currentCost); @@ -857,7 +906,7 @@ describe('Correction Distance Modeler', () => { it('step 2: first processing layer resolves zero + one char inserts', () => { // From "step 1" above, assertions removed const teNode = fetchCommonTENode(); - const subsetNodes = teNode.buildSubstitutionEdges(synthDistribution); + const subsetNodes = teNode.buildSubstitutionEdges(synthDistribution, SEARCH_EDGE_SEED++); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); // Set 0: set for ins 1, dl 0 @@ -986,7 +1035,7 @@ describe('Correction Distance Modeler', () => { it('step 3: second processing layer resolves two char inserts', () => { // From "steps 0, 1" above, assertions removed const teNode = fetchCommonTENode(); - const subsetNodes = teNode.buildSubstitutionEdges(synthDistribution); + const subsetNodes = teNode.buildSubstitutionEdges(synthDistribution, SEARCH_EDGE_SEED++); subsetNodes.sort((a, b) => a.currentCost - b.currentCost); // ************ @@ -1054,60 +1103,67 @@ describe('Correction Distance Modeler', () => { }); }); - it('Small integration test: "teh" => "ten", "the"', function() { + it('Small integration test: "teh" => "ten", "the"', () => { // The combinatorial effect here is a bit much to fully test. - let rootTraversal = testModel.traverseFromRoot(); + const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - let rootNode = new correction.SearchNode(rootTraversal); + const rootSeed = SEARCH_EDGE_SEED++; + const rootNode = new correction.SearchNode(rootTraversal, rootSeed); assert.equal(rootNode.calculation.getHeuristicFinalCost(), 0); // VERY artificial distributions. - let synthDistribution1 = [ + const synthDistribution1 = [ {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability ]; - let synthDistribution2 = [ + const synthDistribution2 = [ {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} ]; - let synthDistribution3 = [ + const synthDistribution3 = [ {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} ]; - let layer1Edges = rootNode.buildSubstitutionEdges(synthDistribution1) + const layer1Id = SEARCH_EDGE_SEED++; + const layer1Edges = rootNode.buildSubstitutionEdges(synthDistribution1, layer1Id) // No 2+ inserts here; we're fine with just one call. .flatMap(e => e.processSubsetEdge()); const layer1Queue = new PriorityQueue(QUEUE_NODE_COMPARATOR, layer1Edges); - let tEdge = layer1Queue.dequeue(); + const tEdge = layer1Queue.dequeue(); assertEdgeChars(tEdge, 't', 't'); + assert.equal(tEdge.spaceId, layer1Id); // would be obtained by the token after one input. - let layer2Edges = tEdge.buildSubstitutionEdges(synthDistribution2) + const layer2Id = SEARCH_EDGE_SEED++; + const layer2Edges = tEdge.buildSubstitutionEdges(synthDistribution2, layer2Id) // No 2+ inserts here; we're fine with just one call. .flatMap(e => e.processSubsetEdge()); const layer2Queue = new PriorityQueue(QUEUE_NODE_COMPARATOR, layer2Edges); - let eEdge = layer2Queue.dequeue(); + const eEdge = layer2Queue.dequeue(); assertEdgeChars(eEdge, 'e', 'e'); + assert.equal(eEdge.spaceId, layer2Id); - let hEdge = layer2Queue.dequeue(); + const hEdge = layer2Queue.dequeue(); assertEdgeChars(hEdge, 'h', 'h'); + assert.equal(hEdge.spaceId, layer2Id); // Needed for a proper e <-> h transposition. - let ehEdge = findEdgesWithChars(layer2Edges, 'h')[0]; + const ehEdge = findEdgesWithChars(layer2Edges, 'h')[0]; assert.isOk(ehEdge); // Final round: we'll use three nodes and throw all of their results into the same priority queue. - let layer3eEdges = eEdge.buildSubstitutionEdges(synthDistribution3) + const layer3Id = SEARCH_EDGE_SEED++; + const layer3eEdges = eEdge.buildSubstitutionEdges(synthDistribution3, layer3Id) // No 2+ inserts here; we're fine with just one call. .flatMap(e => e.processSubsetEdge()); - let layer3hEdges = hEdge.buildSubstitutionEdges(synthDistribution3) + const layer3hEdges = hEdge.buildSubstitutionEdges(synthDistribution3, layer3Id) .flatMap(e => e.processSubsetEdge()); - let layer3ehEdges = ehEdge.buildSubstitutionEdges(synthDistribution3) + const layer3ehEdges = ehEdge.buildSubstitutionEdges(synthDistribution3, layer3Id) .flatMap(e => e.processSubsetEdge()); const layer3Queue = new PriorityQueue(QUEUE_NODE_COMPARATOR, layer3eEdges.concat(layer3hEdges).concat(layer3ehEdges)); @@ -1118,12 +1174,14 @@ describe('Correction Distance Modeler', () => { } while(bestEdge.currentTraversal.entries.length == 0); assertEdgeChars(bestEdge, 'n', 'n'); // 'ten' - perfect edit distance of 0, though less-likely input sequence. + assert.equal(bestEdge.spaceId, layer3Id); // No cost assumptions here. var sibling1; do { sibling1 = layer3Queue.dequeue(); } while(sibling1.currentTraversal.entries.length == 0); + assert.equal(sibling1.spaceId, layer3Id); // Both have a raw edit distance of 1 while using the same input-sequence root. ('th') let tenFlag = edgeHasChars(sibling1, SENTINEL_CODE_UNIT, 'n'); // subs out the 'h' entirely. Could also occur with 'a', but is too unlikely. @@ -1136,6 +1194,7 @@ describe('Correction Distance Modeler', () => { do { sibling2 = layer3Queue.dequeue(); } while(sibling2.currentTraversal.entries.length == 0); + assert.equal(sibling2.spaceId, layer3Id); tenFlag = tenFlag || edgeHasChars(sibling2, SENTINEL_CODE_UNIT, 'n'); theFlag = theFlag || edgeHasChars(sibling2, SENTINEL_CODE_UNIT, 'e'); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts index a661fee1284..acc5f0701a4 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-space.tests.ts @@ -10,7 +10,7 @@ import { assert } from 'chai'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; -import { correction, getBestMatches, models, SearchPath } from '@keymanapp/lm-worker/test-index'; +import { correction, getBestMatches, models, SearchCluster, SearchPath } from '@keymanapp/lm-worker/test-index'; import SearchResult = correction.SearchResult; import TrieModel = models.TrieModel; @@ -21,14 +21,119 @@ function buildTestTimer() { return new correction.ExecutionTimer(Number.MAX_VALUE, Number.MAX_VALUE); } -describe('SearchSpace', () => { - const checkRepeatableResults_teh = async (iter: AsyncGenerator) => { +const cantTestSetup = () => { + const rootSpace = new SearchCluster(testModel); + // The exact set of inputs and spaces here is a bit contrived, but the + // design does accommodate this setup. + + // 'c' space + const c1_t1_space = new SearchCluster([rootSpace.addInput([ + {sample: {insert: 'c', deleteLeft: 0}, p: 0.5} + ], 0.5)]); + const c_an_path = c1_t1_space.addInput([ + {sample: {insert: 'an', deleteLeft: 0}, p: 0.25} + ], 0.5); + const c2_t1_space = new SearchCluster([rootSpace.addInput([ + {sample: {insert: 'ca', deleteLeft: 0}, p: 0.5} + ], 0.25)]); + const ca_n_path = c2_t1_space.addInput([ + {sample: {insert: 'n', deleteLeft: 0}, p: 0.5} + ], 0.5); + + // 3 codepoints in 2 transforms. + // c + an + // ca + n + const c3_t2_space = new SearchCluster([c_an_path, ca_n_path]); + + // q for `'`, the single-Quote. + const c_an_q_path = c3_t2_space.addInput([ + {sample: {insert: "'", deleteLeft: 0}, p: 0.5} + ], 0.5); + + // c + a + const c_a_path = c1_t1_space.addInput([ + {sample: {insert: 'a', deleteLeft: 0}, p: 0.25} + ], 0.5); + + const c3_t1_space = new SearchCluster([rootSpace.addInput([ + {sample: {insert: 'cam', deleteLeft: 0}, p: 0.125}, + {sample: {insert: 'can', deleteLeft: 0}, p: 0.125} + ], 0.5)]); + + // [cam, can] + dl1 + const ca_dl1_path = c3_t1_space.addInput([ + {sample: {insert: '', deleteLeft: 1}, p: 0.25} + ], 0.5); + + const c2_t2_space = new SearchCluster([c_a_path, ca_dl1_path]); + + // c + a + n' + // [cam, can] + dl1 + n' + const c_a_nq_path = c2_t2_space.addInput([ + {sample: {insert: 'n\'', deleteLeft: 0}, p: 0.4} + ], 0.5); + + // [cam, can] + " + const c4_t2_space = new SearchCluster([c3_t1_space.addInput([ + {sample: {insert: '"', deleteLeft: 0}, p: 0.125}, + ], 0.5)]); + const can_q_dl1q_path = c4_t2_space.addInput([ + {sample: {insert: "'", deleteLeft: 1}, p: 0.125}, + ], 0.5); + + const c4_t3_space = new SearchCluster([c_an_q_path, c_a_nq_path, can_q_dl1q_path]); + const c_an_q_t_path = c4_t3_space.addInput([ + {sample: {insert: "t", deleteLeft: 0}, p: 0.9} + ], 0.9); + + // c + a + n + const c3_t3_space = new SearchCluster([c2_t2_space.addInput([ + {sample: {insert: "n", deleteLeft: 0}, p: 0.125}, + ], 0.5)]) + // c + a + n + 't + const c_a_n_qt_path = c3_t3_space.addInput([ + {sample: {insert: "'t", deleteLeft: 0}, p: 0.1}, + ], 0.9); + + // ALL above lead here: 5 chars (`can't`) in 4 transforms. + const c5_4t_space = new SearchCluster([c_an_q_t_path, c_a_n_qt_path]); + + return { + paths: { + c_a_path, + c_an_path, + ca_dl1_path, + ca_n_path, + c_an_q_path, + c_an_q_t_path, + c_a_nq_path, + can_q_dl1q_path, + c_a_n_qt_path + }, + spaces: { + rootSpace, + c1_t1_space, + c2_t1_space, + c2_t2_space, + c3_t1_space, + c3_t2_space, + c3_t3_space, + c4_t2_space, + c4_t3_space, + c5_4t_space + } + }; +} + +describe('SearchSpace + SearchPath', () => { + const checkRepeatableResults_teh = async (iter: AsyncGenerator, expectedSpaceId: number) => { const firstIterResult = await iter.next(); // {value: , done: } assert.isFalse(firstIterResult.done); const firstResult: correction.SearchResult = firstIterResult.value; // Retrieves // No checks on the first set's cost. assert.equal(firstResult.matchString, "ten"); + assert.equal(firstResult.spaceId, expectedSpaceId); // All start with 'te' but one, and invoke one edit of the same cost. // 'th' has an 'h' at the same cost (input 3) of the 'e' (input 2). @@ -51,6 +156,7 @@ describe('SearchSpace', () => { } else { cost = result.totalCost; } + assert.equal(result.spaceId, expectedSpaceId); const matchIndex = batch.findIndex((entry) => entry == result.matchString); assert.notEqual(matchIndex, -1, `'${result.matchString}' received as prediction too early`); @@ -96,9 +202,9 @@ describe('SearchSpace', () => { const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - const searchSpace = new SearchPath(testModel); + const searchSpace = new SearchCluster(testModel); - const iter = getBestMatches(searchSpace, buildTestTimer()); + const iter = getBestMatches([searchSpace], buildTestTimer()); const firstResult = await iter.next(); assert.isFalse(firstResult.done); }); @@ -109,7 +215,7 @@ describe('SearchSpace', () => { const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - let searchPath = new SearchPath(testModel); + const searchPath = new SearchPath(testModel); // VERY artificial distributions. const synthInput1 = [ @@ -126,12 +232,16 @@ describe('SearchSpace', () => { {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} ]; - searchPath = searchPath.addInput(synthInput1, 1); - searchPath = searchPath.addInput(synthInput2, .75); - searchPath = searchPath.addInput(synthInput3, .75); + const searchPath1 = new SearchPath(searchPath, synthInput1, 1); + const searchPath2 = new SearchPath(searchPath1, synthInput2, .75); + const searchPath3 = new SearchPath(searchPath2, synthInput3, .75); - const iter = getBestMatches(searchPath, buildTestTimer()); // disables the correction-search timeout. - await checkRepeatableResults_teh(iter); + assert.notEqual(searchPath1.spaceId, searchPath.spaceId); + assert.notEqual(searchPath2.spaceId, searchPath1.spaceId); + assert.notEqual(searchPath3.spaceId, searchPath2.spaceId); + + const iter = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter, searchPath3.spaceId); }); it('Allows reiteration (sequentially)', async () => { @@ -139,7 +249,7 @@ describe('SearchSpace', () => { const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - let searchSpace = new SearchPath(testModel); + const searchPath = new SearchPath(testModel); // VERY artificial distributions. const synthInput1 = [ @@ -156,17 +266,21 @@ describe('SearchSpace', () => { {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} ]; - searchSpace = searchSpace.addInput(synthInput1, 1); - searchSpace = searchSpace.addInput(synthInput2, .75); - searchSpace = searchSpace.addInput(synthInput3, .75); + const searchPath1 = new SearchPath(searchPath, synthInput1, 1); + const searchPath2 = new SearchPath(searchPath1, synthInput2, .75); + const searchPath3 = new SearchPath(searchPath2, synthInput3, .75); + + assert.notEqual(searchPath1.spaceId, searchPath.spaceId); + assert.notEqual(searchPath2.spaceId, searchPath1.spaceId); + assert.notEqual(searchPath3.spaceId, searchPath2.spaceId); - const iter = getBestMatches(searchSpace, buildTestTimer()); // disables the correction-search timeout. - await checkRepeatableResults_teh(iter); + const iter = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter, searchPath3.spaceId); // The key: do we get the same results the second time? // Reset the iterator first... - const iter2 = getBestMatches(searchSpace, buildTestTimer()); // disables the correction-search timeout. - await checkRepeatableResults_teh(iter2); + const iter2 = getBestMatches([searchPath3], buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter2, searchPath3.spaceId); }); it('Empty search space, loaded model', async () => { @@ -174,9 +288,9 @@ describe('SearchSpace', () => { const rootTraversal = testModel.traverseFromRoot(); assert.isNotEmpty(rootTraversal); - const searchSpace = new SearchPath(testModel); + const searchSpace = new SearchCluster(testModel); const timer = buildTestTimer(); - const iter = getBestMatches(searchSpace, timer); + const iter = getBestMatches([searchSpace], timer); // While there's no input, insertion operations can produce suggestions. const resultState = await iter.next(); @@ -199,4 +313,303 @@ describe('SearchSpace', () => { assert.equal(laterResult.matchString, 't'); assert.isFalse(resultState.done); }); + + describe.skip('.inputSequences', () => { + it('define tests here', () => { + // + }); + }); + + describe("Splitting", () => { + describe("on SearchPath", () => { + it('handles simple cases built from two-char transforms', () => { + let root = new SearchPath(testModel); + let entries = ['ap', 'pl', 'es']; + + let path = root; + for(let entry of entries) { + path = new SearchCluster([path]).addInput([{sample: { insert: entry, deleteLeft: 0 }, p: .5}], .5); + } + + const splitPath = path.split(1, testModel); + assert.deepEqual(splitPath[0].bestExample, { + text: 'apple', + p: .125 + }); + assert.deepEqual(splitPath[1].bestExample, { + text: 's', + p: .5 + }); + }); + + it('splits inputs with delete properly when needed - char index 0', () => { + let root = new SearchPath(testModel); + let entries = ['ap', 'pl', 'y']; + + let path = root; + for(let entry of entries) { + path = new SearchCluster([path]).addInput([{sample: { insert: entry, deleteLeft: 0 }, p: .5}], .5); + } + + path = new SearchCluster([path]).addInput([{ + sample: { insert: 'es', deleteLeft: 1 }, p: .5 + }], .5) + + const splitPath = path.split(0, testModel); // do with a '1' too! + assert.deepEqual(splitPath[0].bestExample, { + text: 'appl', + p: .0625 + }); + assert.deepEqual(splitPath[1].bestExample, { + text: 'es', + p: .5 + }); + }); + + it('splits inputs with delete properly when needed - char index 1', () => { + let root = new SearchPath(testModel); + let entries = ['ap', 'pl', 'y']; + + let path = root; + for(let entry of entries) { + path = new SearchCluster([path]).addInput([{sample: { insert: entry, deleteLeft: 0 }, p: .5}], .5); + } + + path = new SearchCluster([path]).addInput([{ + sample: { insert: 'es', deleteLeft: 1 }, p: .5 + }], .5) + + const splitPath = path.split(1, testModel); // do with a '1' too! + assert.deepEqual(splitPath[0].bestExample, { + text: 'apple', + p: .0625 + }); + assert.deepEqual(splitPath[1].bestExample, { + text: 's', + p: .5 + }); + }); + }); + + describe('on SearchSpace', () => { + it('handles cases built solely from single-char transforms', () => { + let rootSpace = new SearchCluster(testModel); + const inputs = [...'apples']; + + let space = rootSpace; + let lSpace: SearchCluster; + for(const input of inputs) { + space = new SearchCluster([space.addInput([{sample: {insert: input, deleteLeft: 0}, p: 0.5}], .5)]); + + if(input == 'l') { + lSpace = space; + } + } + + const splitSpace = space.split(4, testModel); + assert.equal(splitSpace.length, 1); + const splitCase = splitSpace[0]; + assert.deepEqual(splitCase[0].bestExample, { + text: 'appl', + p: 0.0625 // .5 ^ 4 + }); + assert.deepEqual(splitCase[1].bestExample, { + text: 'es', + p: .25 + }); + + assert.equal(splitCase[0], lSpace); + }); + + it('handles cases built from two paths', () => { + let rootSpace = new SearchCluster(testModel); + + let space1 = rootSpace; + space1 = new SearchCluster([space1.addInput([{sample: {insert: 'al', deleteLeft: 0}, p: 0.5}], .5)]); + let path1 = space1.addInput([{sample: { insert: 'e', deleteLeft: 0}, p: 0.5}], .5); + + let space2 = rootSpace; + space2 = new SearchCluster([space2.addInput([{sample: {insert: 'a', deleteLeft: 0}, p: 0.5}], .5)]); + let path2 = space2.addInput([{sample: { insert: 'le', deleteLeft: 0}, p: 0.5}], .5); + + // Both end up with the same codepoint length, required the same number of inputs, and + // started from the same root. + let combinedSpace = new SearchCluster([path1, path2]); + + const splitSpace = combinedSpace.split(1, testModel); + assert.equal(splitSpace.length, 2); + assert.sameMembers(splitSpace.map(s => s[1].inputCount), [1, 2]); + + const cleanSplit = splitSpace.find(s => s[1].inputCount == 1); + assert.deepEqual(cleanSplit[0].bestExample, { + text: 'a', + p: 0.5 + }); + assert.equal(cleanSplit[0].inputCount, 1); + assert.deepEqual(cleanSplit[1].bestExample, { + text: 'le', + p: 0.5 + }); + + const trickySplit = splitSpace.find(s => s[1].inputCount == 2); + assert.deepEqual(trickySplit[0].bestExample, { + text: 'a', + p: 0.5 + }); + assert.equal(trickySplit[0].inputCount, 1); + assert.deepEqual(trickySplit[1].bestExample, { + text: 'le', + p: 0.25 + }); + assert.equal(trickySplit[1].inputCount, 2); + }); + + it('does not alter contents of original spaces or paths', () => { + let rootSpace = new SearchCluster(testModel); + + let space1 = rootSpace; + space1 = new SearchCluster([space1.addInput([{sample: {insert: 'al', deleteLeft: 0}, p: 0.5}], .5)]); + let path1 = space1.addInput([{sample: { insert: 'e', deleteLeft: 0}, p: 0.5}], .5); + + let space2 = rootSpace; + space2 = new SearchCluster([space2.addInput([{sample: {insert: 'a', deleteLeft: 0}, p: 0.5}], .5)]); + let path2 = space2.addInput([{sample: { insert: 'le', deleteLeft: 0}, p: 0.5}], .5); + + // Both end up with the same codepoint length, required the same number of inputs, and + // started from the same root. + let combinedSpace = new SearchCluster([path1, path2]); + + const space1InputSequences = space1.inputSequences.slice(); + const space2InputSequences = space2.inputSequences.slice(); + const combinedSpaceInputSequences = combinedSpace.inputSequences.slice(); + + const splitSpace = combinedSpace.split(1, testModel); + assert.equal(splitSpace.length, 2); + + assert.deepEqual(space1.inputSequences, space1InputSequences); + assert.deepEqual(space2.inputSequences, space2InputSequences); + assert.deepEqual(combinedSpace.inputSequences, combinedSpaceInputSequences); + }); + + describe(`complex setup: 5 codepoints in 4 transforms`, () => { + it('sets up test resources correctly', () => { + const cantFixtures = cantTestSetup(); + assert.isOk(cantFixtures); + + // Verify that all stages have the correct number of inputs. + assert.equal(cantFixtures.paths.c_a_path.inputCount, 2); + assert.equal(cantFixtures.paths.c_an_path.inputCount, 2); + assert.equal(cantFixtures.paths.ca_dl1_path.inputCount, 2); + assert.equal(cantFixtures.paths.ca_n_path.inputCount, 2); + + assert.equal(cantFixtures.paths.c_a_nq_path.inputCount, 3); + assert.equal(cantFixtures.paths.c_an_q_path.inputCount, 3); + assert.equal(cantFixtures.paths.can_q_dl1q_path.inputCount, 3); + + assert.equal(cantFixtures.paths.c_a_n_qt_path.inputCount, 4); + assert.equal(cantFixtures.paths.c_an_q_t_path.inputCount, 4); + + assert.equal(cantFixtures.spaces.rootSpace.inputCount, 0); + + assert.equal(cantFixtures.spaces.c1_t1_space.inputCount, 1); + assert.equal(cantFixtures.spaces.c2_t1_space.inputCount, 1); + assert.equal(cantFixtures.spaces.c3_t1_space.inputCount, 1); + + assert.equal(cantFixtures.spaces.c2_t2_space.inputCount, 2); + assert.equal(cantFixtures.spaces.c3_t2_space.inputCount, 2); + assert.equal(cantFixtures.spaces.c4_t2_space.inputCount, 2); + + assert.equal(cantFixtures.spaces.c3_t3_space.inputCount, 3); + assert.equal(cantFixtures.spaces.c4_t3_space.inputCount, 3); + + assert.equal(cantFixtures.spaces.c5_4t_space.inputCount, 4); + }); + + // do split-tests at each char position: 0 to 5. (Technically, only 1-4 should be used + // b/c word-internal, but... yeah.) + + it('splits correctly at index 0', () => { + const cantFixtures = cantTestSetup(); + const finalSpace = cantFixtures.spaces.c5_4t_space; + + const results = finalSpace.split(0, testModel); + + assert.isOk(results); + assert.equal(results.length, 1); + }); + + it('splits correctly at index 1', () => { + const cantFixtures = cantTestSetup(); + const finalSpace = cantFixtures.spaces.c5_4t_space; + + const results = finalSpace.split(1, testModel); + + assert.isOk(results); + // There exist 3 different spaces with a single transform, + // outputting 1, 2, and 3 codepoints respectively. + // + // The '2' and '3' variants must be split, but their paths + // recombine at the end. + // + // So, we get two: + // - One with a perfect split after the full first input + // - One with partial splits in the middle of that first input + assert.equal(results.length, 2); + }); + + it('splits correctly at index 2', () => { + const cantFixtures = cantTestSetup(); + const finalSpace = cantFixtures.spaces.c5_4t_space; + + const results = finalSpace.split(2, testModel); + + assert.isOk(results); + // Expected paths: + // - c + a, ca[mn] + dl1 (two transforms, no split) + // - c + a | n (two transforms with split at pos 1) + // - ca (single transform) + // - ca | [m, n] (single transform with split at pos 2) + assert.equal(results.length, 4); + }); + + it('splits correctly at index 3', () => { + const cantFixtures = cantTestSetup(); + const finalSpace = cantFixtures.spaces.c5_4t_space; + + const results = finalSpace.split(3, testModel); + + assert.isOk(results); + // There exist 3 variants of spaces with 3 codepoints: + // 1, 2, and 3 transforms. + // - ca[m|n] + // - c + a + n + // - ca + n, c + an + // There's also a c + a + n' case, where the `n'` must be split. + assert.equal(results.length, 4); + }); + + it('splits correctly at index 4', () => { + const cantFixtures = cantTestSetup(); + const finalSpace = cantFixtures.spaces.c5_4t_space; + + const results = finalSpace.split(4, testModel); + + assert.isOk(results); + // Some have just + t after the end of the head space + // Some split the final transform: `'t`. + assert.equal(results.length, 2); + }); + + it('splits correctly at index 5', () => { + const cantFixtures = cantTestSetup(); + const finalSpace = cantFixtures.spaces.c5_4t_space; + + const results = finalSpace.split(5, testModel); + + assert.isOk(results); + assert.equal(results.length, 1); + }); + }); + }); + }); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts index f4574af195f..be81e711610 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-alignment.tests.ts @@ -48,7 +48,7 @@ describe('determineSuggestionAlignment', () => { transition.finalize(transition.base, [{sample: { insert: '', deleteLeft: 0 }, p: 1}]); // transition, model - const results = determineSuggestionAlignment(transition, plainCasedModel); + const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel); assert.deepEqual(results.predictionContext, context); assert.equal(results.deleteLeft, "techn".length); @@ -65,7 +65,7 @@ describe('determineSuggestionAlignment', () => { const transition = baseState.analyzeTransition(context, [{sample: { insert: '', deleteLeft: 1 }, p: 1}]) // transition, model - const results = determineSuggestionAlignment(transition, plainCasedModel); + const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel); assert.deepEqual(results.predictionContext, context); assert.equal(results.deleteLeft, "tech".length + 1 /* for the deleted whitespace */); @@ -82,7 +82,7 @@ describe('determineSuggestionAlignment', () => { const transition = baseState.analyzeTransition(context, [{sample: { insert: 'n', deleteLeft: 1 }, p: 1}]) // transition, model - const results = determineSuggestionAlignment(transition, plainCasedModel); + const results = determineSuggestionAlignment(transition, transition.final.tokenization, plainCasedModel); assert.deepEqual(results.predictionContext, context); assert.equal(results.deleteLeft, "techn".length + 1 /* for the deleted whitespace */); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts index e5c511adb76..1008699b3ac 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/determine-suggestion-context-transition.tests.ts @@ -108,7 +108,7 @@ describe('determineContextTransition', () => { assert.equal(transition.final.context.left, targetContext.left); assert.equal(transition.final.context.right ?? "", targetContext.right ?? ""); assert.sameDeepOrderedMembers(transition.inputDistribution, inputDistribution); - assert.isNotOk(transition.preservationTransform); + assert.isNotOk(transition.final.tokenization.taillessTrueKeystroke); assert.equal(transition.transitionId, 1); } finally { warningEmitterSpy.restore();