diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts index 01d1a7a1935..efccd00cc46 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts @@ -11,7 +11,7 @@ import { applyTransform, buildMergedTransform } from "@keymanapp/models-template import { LexicalModelTypes } from '@keymanapp/common-types'; import { deepCopy, KMWString } from "@keymanapp/web-utils"; -import { SearchSpace } from "./distance-modeler.js"; +import { SearchQuotientSpur } from "./search-quotient-spur.js"; import { TokenSplitMap } from "./context-tokenization.js"; import Distribution = LexicalModelTypes.Distribution; @@ -58,7 +58,7 @@ export class ContextToken { * Contains all relevant correction-search data for use in generating * corrections for this ContextToken instance. */ - readonly searchSpace: SearchSpace; + readonly searchSpace: SearchQuotientSpur; isPartial: boolean; @@ -104,7 +104,7 @@ export class ContextToken { // // In case we are unable to perfectly track context (say, due to multitaps) // we need to ensure that only fully-utilized keystrokes are considered. - this.searchSpace = new SearchSpace(priorToken.searchSpace); + this.searchSpace = new SearchQuotientSpur(priorToken.searchSpace); this._inputRange = priorToken._inputRange.slice(); // Preserve any annotated applied-suggestion transition ID data; it's useful @@ -118,7 +118,7 @@ export class ContextToken { // May be altered outside of the constructor. 
this.isWhitespace = false; this.isPartial = !!isPartial; - this.searchSpace = new SearchSpace(model); + this.searchSpace = new SearchQuotientSpur(model); this._inputRange = []; rawText ||= ''; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts index 2e5655bb287..cf432c4f643 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/distance-modeler.ts @@ -1,14 +1,14 @@ import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates'; -import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils'; +import { PriorityQueue } from '@keymanapp/web-utils'; import { LexicalModelTypes } from '@keymanapp/common-types'; import { ClassicalDistanceCalculation } from './classical-calculation.js'; import { ExecutionTimer, STANDARD_TIME_BETWEEN_DEFERS } from './execution-timer.js'; +import { QUEUE_NODE_COMPARATOR, SearchQuotientSpur } from './search-quotient-spur.js'; import { subsetByChar, subsetByInterval, mergeSubset, TransformSubset } from '../transform-subsets.js'; import Distribution = LexicalModelTypes.Distribution; -import LexicalModel = LexicalModelTypes.LexicalModel; import LexiconTraversal = LexicalModelTypes.LexiconTraversal; import ProbabilityMass = LexicalModelTypes.ProbabilityMass; import Transform = LexicalModelTypes.Transform; @@ -20,10 +20,6 @@ export type TraversableToken = { traversal: LexiconTraversal } -export const QUEUE_NODE_COMPARATOR: Comparator = function(arg1, arg2) { - return arg1.currentCost - arg2.currentCost; -} - enum TimedTaskTypes { CACHED_RESULT = 0, PREDICTING = 1, @@ -184,7 +180,7 @@ export class SearchNode { if(this._inputCost !== undefined) { return this._inputCost; } else { - let MIN_P = SearchSpace.MIN_KEYSTROKE_PROBABILITY; + let MIN_P = 
SearchQuotientSpur.MIN_KEYSTROKE_PROBABILITY; // Should technically re-normalize the sampling distribution. // -ln(p) is smaller for larger probabilities, as ln(p) is always <= 0. Approaches infinity as p => 0. @@ -218,7 +214,7 @@ export class SearchNode { // p = 1 / (e^4) = 0.01831563888. This still exceeds many neighboring keys! // p = 1 / (e^5) = 0.00673794699. Strikes a good balance. // Should easily give priority to neighboring keys before edit-distance kicks in (when keys are a bit ambiguous) - return SearchSpace.EDIT_DISTANCE_COST_SCALE * this.editCount + this.inputSamplingCost; + return SearchQuotientSpur.EDIT_DISTANCE_COST_SCALE * this.editCount + this.inputSamplingCost; } /** @@ -612,378 +608,94 @@ type CompleteSearchPath = { finalNode: SearchNode } -type PathResult = NullPath | IntermediateSearchPath | CompleteSearchPath; - -// The set of search spaces corresponding to the same 'context' for search. -// Whenever a wordbreak boundary is crossed, a new instance should be made. -export class SearchSpace { - // p = 1 / (e^4) = 0.01831563888. This still exceeds many neighboring keys! - // p = 1 / (e^5) = 0.00673794699. Strikes a good balance. - // Should easily give priority to neighboring keys before edit-distance kicks in (when keys are a bit ambiguous) - static readonly EDIT_DISTANCE_COST_SCALE = 5; - static readonly MIN_KEYSTROKE_PROBABILITY = 0.0001; - static readonly DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL = 33; // in milliseconds. - - private selectionQueue: PriorityQueue; - private _inputSequence: Distribution[] = []; - private minInputCost: number[] = []; - private rootNode: SearchNode; - - // We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time - // is cheaper than iteratively building a priority queue. - /** - * This tracks all paths that have reached the end of a viable input-matching path - even - * those of lower cost that produce the same correction as other paths. 
- * - * When new input is received, its entries are then used to append edges to the path in order - * to find potential paths to reach a new viable end. - */ - private completedPaths: SearchNode[]; - - /** - * Marks all results that have already been returned since the last input was received. - * Is cleared after .addInput() calls. - */ - private returnedValues: {[resultKey: string]: SearchNode} = {}; - - /** - * Acts as a Map that prevents duplicating a correction-search path if reached - * more than once. - */ - private processedEdgeSet: {[pathKey: string]: boolean} = {}; - - /** - * Provides a heuristic for the base cost at each depth if the best - * individual input were taken at that level. - */ - private lowestCostAtDepth: number[]; - - /** - * Clone constructor. Deep-copies its internal queues, but not search nodes. - * @param instance - */ - constructor(instance: SearchSpace); - /** - * Constructs a fresh SearchSpace instance for used in predictive-text correction - * and suggestion searches. - * @param model - */ - constructor(model: LexicalModel); - constructor(arg1: SearchSpace|LexicalModel) { - if(arg1 instanceof SearchSpace) { - this._inputSequence = [].concat(arg1._inputSequence); - this.minInputCost = [].concat(arg1.minInputCost); - this.rootNode = arg1.rootNode; - // Re-use already-checked Nodes. 
- this.completedPaths = [].concat(arg1.completedPaths); - this.lowestCostAtDepth = arg1.lowestCostAtDepth.slice(); - this.returnedValues = {...arg1.returnedValues}; - this.processedEdgeSet = {...arg1.processedEdgeSet}; - - this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR); - this.selectionQueue.enqueueAll([...arg1.selectionQueue.toArray()]); - return; - } - - const model = arg1; - if(!model) { - throw new Error("The LexicalModel parameter must not be null / undefined."); - } else if(!model.traverseFromRoot) { - throw new Error("The provided model does not implement the `traverseFromRoot` function, which is needed to support robust correction searching."); - } - - this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR); - this.rootNode = new SearchNode(model.traverseFromRoot(), model.toKey ? model.toKey.bind(model) : null); - this.selectionQueue.enqueue(this.rootNode); - this.lowestCostAtDepth = []; - - this.completedPaths = []; - } - - /** - * Retrieves the sequence of inputs - */ - public get inputSequence() { - return [...this._inputSequence]; - } - - increaseMaxEditDistance() { - // By extracting the entries from the priority queue and increasing distance outside of it as a batch job, - // we get an O(N) implementation, rather than the O(N log N) that would result from maintaining the original queue. - const entries = this.selectionQueue.toArray(); - - entries.forEach(function(edge) { edge.calculation = edge.calculation.increaseMaxDistance(); }); - - // Since we just modified the stored instances, and the costs may have shifted, we need to re-heapify. - this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, entries); - } - - get correctionsEnabled() { - // When corrections are disabled, the Web engine will only provide individual Transforms - // for an input, not a distribution. No distributions means we shouldn't do corrections. 
- return !!this._inputSequence.find((distribution) => distribution.length > 1); - } - - /** - * Extends the correction-search process embodied by this SearchSpace by an extra - * input character, according to the characters' likelihood in the distribution. - * @param inputDistribution The fat-finger distribution for the incoming keystroke (or - * just the raw keystroke if corrections are disabled) - */ - addInput(inputDistribution: Distribution, bestProbFromSet: number) { - const input = inputDistribution; - this._inputSequence.push(input); - const lastDepthCost = this.lowestCostAtDepth[this.lowestCostAtDepth.length - 1] ?? 0; - const logTierCost = -Math.log(bestProbFromSet); - this.lowestCostAtDepth.push(lastDepthCost + logTierCost); - - // Assumes that `inputDistribution` is already sorted. - this.minInputCost.push(-Math.log(inputDistribution[0].p)); - - // With a newly-available input, we can extend new input-dependent paths from - // our previously-reached 'extractedResults' nodes. - let newlyAvailableEdges: SearchNode[] = []; - let batches = this.completedPaths.map(function(node) { - let deletions = node.buildDeletionEdges(inputDistribution); - let substitutions = node.buildSubstitutionEdges(inputDistribution); - - const batch = deletions.concat(substitutions); - - // Skip the queue for the first pass; there will ALWAYS be at least one pass, - // and queue-enqueing does come with a cost. Avoid the unnecessary overhead. - return batch.flatMap(e => e.processSubsetEdge()); - }); - - // Don't forget to reset the array; the contained nodes no longer reach the search's end. - this.completedPaths = []; - this.returnedValues = {}; - - batches.forEach(function(batch) { - newlyAvailableEdges = newlyAvailableEdges.concat(batch); - }); - - this.selectionQueue.enqueueAll(newlyAvailableEdges); - } - - // TODO: will want eventually for reversions and/or backspaces - removeLastInput() { - // 1. 
truncate all entries from that search tier; we need to 'restore' extractedResults to match - // the state that would have existed without the last search tier. - // 2. remove the last search tier. Which may necessitate reconstructing the tier queue, but oh well. - } +export type PathResult = NullPath | IntermediateSearchPath | CompleteSearchPath; - /** - * Indicates if the correction-search has another entry (and thus has not yet - * reached its end). - * @returns - */ - private hasNextMatchEntry(): boolean { - return this.selectionQueue.count > 0 && this.selectionQueue.peek().currentCost < Number.POSITIVE_INFINITY; - } +// Current best guesstimate of how compositor will retrieve ideal corrections. +export async function *getBestMatches(searchSpace: SearchQuotientSpur, timer: ExecutionTimer): AsyncGenerator { + let currentReturns: {[resultKey: string]: SearchNode} = {}; - public getCurrentCost(): number { - if(this.selectionQueue.count > 0) { - return this.selectionQueue.peek().currentCost; - } + // Stage 1 - if we already have extracted results, build a queue just for them and iterate over it first. + const returnedValues = Object.values(searchSpace.returnedValues); + if(returnedValues.length > 0) { + let preprocessedQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, returnedValues); - return Number.POSITIVE_INFINITY; - } + while(preprocessedQueue.count > 0) { + const entryFromCache = timer.time(() => { + let entry = preprocessedQueue.dequeue(); - /** - * Retrieves the lowest-cost / lowest-distance edge from the selection queue, - * checks its validity as a correction to the input text, and reports on what - * sort of result the edge's destination node represents. 
- * @returns - */ - private handleNextNode(): PathResult { - if(!this.hasNextMatchEntry()) { - return { type: 'none' }; - } - - let currentNode = this.selectionQueue.dequeue(); - - let unmatchedResult: IntermediateSearchPath = { - type: 'intermediate', - cost: currentNode.currentCost - } - - // Have we already processed a matching edge? If so, skip it. - // We already know the previous edge is of lower cost. - if(this.processedEdgeSet[currentNode.pathKey]) { - return unmatchedResult; - } else { - this.processedEdgeSet[currentNode.pathKey] = true; - } - - // Stage 1: filter out nodes/edges we want to prune - - // Forbid a raw edit-distance of greater than 2. - // Note: .knownCost is not scaled, while its contribution to .currentCost _is_ scaled. - let substitutionsOnly = false; - if(currentNode.editCount > 2) { - return unmatchedResult; - } else if(currentNode.editCount == 2) { - // Hard restriction: no further edits will be supported. This helps keep the search - // more narrowly focused. - substitutionsOnly = true; - } - - // Thresholds _any_ path, partially based on currently-traversed distance. - // Allows a little 'wiggle room' + 2 "hard" edits. - // Can be important if needed characters don't actually exist on the keyboard - // ... or even just not the then-current layer of the keyboard. - // - // TODO: still consider the lowest-cost individual edges for THIS specific criterion. - const tierMinCost = this.lowestCostAtDepth[currentNode.priorInput.length-1]; - if(currentNode.currentCost > tierMinCost + 2.5 * SearchSpace.EDIT_DISTANCE_COST_SCALE) { - return unmatchedResult; - } - - // Stage 2: process subset further OR build remaining edges - - if(currentNode.hasPartialInput) { - // Re-use the current queue; the number of total inputs considered still holds. 
- this.selectionQueue.enqueueAll(currentNode.processSubsetEdge()); - return unmatchedResult; - } - - // OK, we fully crossed a graph edge and have landed on a transition point; - // time to build more edges / edge batches. - - // Always possible, as this does not require any new input. - if(!substitutionsOnly) { - let insertionEdges = currentNode.buildInsertionEdges(); - this.selectionQueue.enqueueAll(insertionEdges); - } - - if(currentNode.calculation.inputSequence.length == this.inputSequence.length) { - // It was the final tier - store the node for future reference. - this.completedPaths.push(currentNode); + // Is the entry a reasonable result? + if(entry.isFullReplacement) { + // If the entry's 'match' fully replaces the input string, we consider it + // unreasonable and ignore it. + return null; + } - if((this.returnedValues[currentNode.resultKey]?.currentCost ?? Number.POSITIVE_INFINITY) > currentNode.currentCost) { - this.returnedValues[currentNode.resultKey] = currentNode; - } else { - // Not a better cost, so reject it and move on to the next potential result. - return this.handleNextNode(); - } + currentReturns[entry.resultKey] = entry; + // Do not track yielded time. + return new SearchResult(entry); + }, TimedTaskTypes.CACHED_RESULT); - return { - type: 'complete', - cost: currentNode.currentCost, - finalNode: currentNode - }; - } else { - // Time to construct new edges for the next tier! - let inputIndex = currentNode.calculation.inputSequence.length; + if(entryFromCache) { + // Time yielded here is generally spent on turning corrections into predictions. + // It's timing a different sort of task, so... different task set ID. 
+ const timeSpan = timer.start(TimedTaskTypes.PREDICTING); + yield entryFromCache; + timeSpan.end(); - let deletionEdges: SearchNode[] = []; - if(!substitutionsOnly) { - deletionEdges = currentNode.buildDeletionEdges(this._inputSequence[inputIndex]); + if(timer.timeSinceLastDefer > STANDARD_TIME_BETWEEN_DEFERS) { + await timer.defer(); + } } - const substitutionEdges = currentNode.buildSubstitutionEdges(this._inputSequence[inputIndex]); - let batch = deletionEdges.concat(substitutionEdges); - - // Skip the queue for the first pass; there will ALWAYS be at least one pass, - // and queue-enqueing does come with a cost - avoid unnecessary overhead here. - batch = batch.flatMap(e => e.processSubsetEdge()); - - // Note: we're live-modifying the tier's cost here! The priority queue loses its guarantees as a result. - this.selectionQueue.enqueueAll(batch); - - // We didn't reach an end-node, so we just end the iteration and continue the search. } - - // If we've somehow fully exhausted all search options, indicate that none remain. - return unmatchedResult; } - // Current best guesstimate of how compositor will retrieve ideal corrections. - async *getBestMatches(timer: ExecutionTimer): AsyncGenerator { - let currentReturns: {[resultKey: string]: SearchNode} = {}; + // Stage 2: the fun part; actually searching! + do { + const entry = timer.time(() => { + let newResult: PathResult = searchSpace.handleNextNode(); - // Stage 1 - if we already have extracted results, build a queue just for them and iterate over it first. 
- let returnedValues = Object.values(this.returnedValues); - if(returnedValues.length > 0) { - let preprocessedQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, returnedValues); - - while(preprocessedQueue.count > 0) { - const entryFromCache = timer.time(() => { - let entry = preprocessedQueue.dequeue(); + if(newResult.type == 'none') { + return null; + } else if(newResult.type == 'complete') { + const node = newResult.finalNode; + + // Is the entry a reasonable result? + if(node.isFullReplacement) { + // If the entry's 'match' fully replaces the input string, we consider it + // unreasonable and ignore it. Also, if we've reached this point... + // we can(?) assume that everything thereafter is as well. + return null; + } - // Is the entry a reasonable result? - if(entry.isFullReplacement) { - // If the entry's 'match' fully replaces the input string, we consider it - // unreasonable and ignore it. - return null; - } + const entry = newResult.finalNode; + // As we can't guarantee a monotonically-increasing cost during the search - + // due to effects from keystrokes with deleteLeft > 0 - it's technically + // possible to find a lower-cost path later in such cases. + // + // If it occurs, we should re-emit it - it'll show up earlier in the + // suggestions that way, as it should. + if((currentReturns[entry.resultKey]?.currentCost ?? Number.MAX_VALUE) > entry.currentCost) { currentReturns[entry.resultKey] = entry; + searchSpace.returnedValues[entry.resultKey] = entry; // Do not track yielded time. return new SearchResult(entry); - }, TimedTaskTypes.CACHED_RESULT); - - if(entryFromCache) { - // Time yielded here is generally spent on turning corrections into predictions. - // It's timing a different sort of task, so... different task set ID. 
- const timeSpan = timer.start(TimedTaskTypes.PREDICTING); - yield entryFromCache; - timeSpan.end(); - - if(timer.timeSinceLastDefer > STANDARD_TIME_BETWEEN_DEFERS) { - await timer.defer(); - } } } - } - // Stage 2: the fun part; actually searching! - do { - const entry = timer.time(() => { - let newResult: PathResult = this.handleNextNode(); + return null; + }, TimedTaskTypes.CORRECTING); - if(newResult.type == 'none') { - return null; - } else if(newResult.type == 'complete') { - const node = newResult.finalNode; - - // Is the entry a reasonable result? - if(node.isFullReplacement) { - // If the entry's 'match' fully replaces the input string, we consider it - // unreasonable and ignore it. Also, if we've reached this point... - // we can(?) assume that everything thereafter is as well. - return null; - } - - const entry = newResult.finalNode; - - // As we can't guarantee a monotonically-increasing cost during the search - - // due to effects from keystrokes with deleteLeft > 0 - it's technically - // possible to find a lower-cost path later in such cases. - // - // If it occurs, we should re-emit it - it'll show up earlier in the - // suggestions that way, as it should. - if((currentReturns[entry.resultKey]?.currentCost ?? Number.MAX_VALUE) > entry.currentCost) { - currentReturns[entry.resultKey] = entry; - this.returnedValues[entry.resultKey] = entry; - // Do not track yielded time. 
- return new SearchResult(entry); - } - } - - return null; - }, TimedTaskTypes.CORRECTING); - - if(entry) { - const timeSpan = timer.start(TimedTaskTypes.PREDICTING); - yield entry; - timeSpan.end(); - } + if(entry) { + const timeSpan = timer.start(TimedTaskTypes.PREDICTING); + yield entry; + timeSpan.end(); + } - if(timer.timeSinceLastDefer > STANDARD_TIME_BETWEEN_DEFERS) { - await timer.defer(); - } - } while(!timer.elapsed && this.hasNextMatchEntry()); + if(timer.timeSinceLastDefer > STANDARD_TIME_BETWEEN_DEFERS) { + await timer.defer(); + } + } while(!timer.elapsed && searchSpace.hasNextMatchEntry()); - return null; - } + return null; } diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts new file mode 100644 index 00000000000..dda414f33b4 --- /dev/null +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts @@ -0,0 +1,306 @@ +/* + * Keyman is copyright (C) SIL Global. MIT License. + * + * Created by jahorton on 2025-10-09 + * + * This file defines the predictive-text engine's SearchQuotientSpur class, + * which is used to manage the search-space(s) for text corrections within the + * engine. + */ + +import { QueueComparator as Comparator, PriorityQueue } from '@keymanapp/web-utils'; +import { LexicalModelTypes } from '@keymanapp/common-types'; + +import { PathResult, SearchNode } from './distance-modeler.js'; + +import Distribution = LexicalModelTypes.Distribution; +import LexicalModel = LexicalModelTypes.LexicalModel; +import Transform = LexicalModelTypes.Transform; + +export const QUEUE_NODE_COMPARATOR: Comparator = function(arg1, arg2) { + return arg1.currentCost - arg2.currentCost; +} + +// The set of search spaces corresponding to the same 'context' for search. +// Whenever a wordbreak boundary is crossed, a new instance should be made. 
+export class SearchQuotientSpur { + // p = 1 / (e^4) = 0.01831563888. This still exceeds many neighboring keys! + // p = 1 / (e^5) = 0.00673794699. Strikes a good balance. + // Should easily give priority to neighboring keys before edit-distance kicks in (when keys are a bit ambiguous) + static readonly EDIT_DISTANCE_COST_SCALE = 5; + static readonly MIN_KEYSTROKE_PROBABILITY = 0.0001; + static readonly DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL = 33; // in milliseconds. + + private selectionQueue: PriorityQueue; + private _inputSequence: Distribution[] = []; + private minInputCost: number[] = []; + private rootNode: SearchNode; + + // We use an array and not a PriorityQueue b/c batch-heapifying at a single point in time + // is cheaper than iteratively building a priority queue. + /** + * This tracks all paths that have reached the end of a viable input-matching path - even + * those of lower cost that produce the same correction as other paths. + * + * When new input is received, its entries are then used to append edges to the path in order + * to find potential paths to reach a new viable end. + */ + private completedPaths: SearchNode[]; + + /** + * Marks all results that have already been returned since the last input was received. + * Is cleared after .addInput() calls. + */ + public returnedValues: {[resultKey: string]: SearchNode} = {}; // TODO: make it private again! + + /** + * Acts as a Map that prevents duplicating a correction-search path if reached + * more than once. + */ + private processedEdgeSet: {[pathKey: string]: boolean} = {}; + + /** + * Provides a heuristic for the base cost at each depth if the best + * individual input were taken at that level. + */ + private lowestCostAtDepth: number[]; + + /** + * Clone constructor. Deep-copies its internal queues, but not search nodes. 
+ * @param instance + */ + constructor(instance: SearchQuotientSpur); + /** + * Constructs a fresh SearchQuotientSpur instance for use in predictive-text correction + * and suggestion searches. + * @param model + */ + constructor(model: LexicalModel); + constructor(arg1: SearchQuotientSpur|LexicalModel) { + if(arg1 instanceof SearchQuotientSpur) { + this._inputSequence = [].concat(arg1._inputSequence); + this.minInputCost = [].concat(arg1.minInputCost); + this.rootNode = arg1.rootNode; + // Re-use already-checked Nodes. + this.completedPaths = [].concat(arg1.completedPaths); + this.lowestCostAtDepth = arg1.lowestCostAtDepth.slice(); + this.returnedValues = {...arg1.returnedValues}; + this.processedEdgeSet = {...arg1.processedEdgeSet}; + + this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR); + this.selectionQueue.enqueueAll([...arg1.selectionQueue.toArray()]); + return; + } + + const model = arg1; + if(!model) { + throw new Error("The LexicalModel parameter must not be null / undefined."); + } else if(!model.traverseFromRoot) { + throw new Error("The provided model does not implement the `traverseFromRoot` function, which is needed to support robust correction searching."); + } + + this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR); + this.rootNode = new SearchNode(model.traverseFromRoot(), model.toKey ? model.toKey.bind(model) : null); + this.selectionQueue.enqueue(this.rootNode); + this.lowestCostAtDepth = []; + + this.completedPaths = []; + } + + /** + * Retrieves the sequence of inputs + */ + public get inputSequence() { + return [...this._inputSequence]; + } + + increaseMaxEditDistance() { + // By extracting the entries from the priority queue and increasing distance outside of it as a batch job, + // we get an O(N) implementation, rather than the O(N log N) that would result from maintaining the original queue. 
+ const entries = this.selectionQueue.toArray(); + + entries.forEach(function(edge) { edge.calculation = edge.calculation.increaseMaxDistance(); }); + + // Since we just modified the stored instances, and the costs may have shifted, we need to re-heapify. + this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, entries); + } + + get correctionsEnabled() { + // When corrections are disabled, the Web engine will only provide individual Transforms + // for an input, not a distribution. No distributions means we shouldn't do corrections. + return !!this._inputSequence.find((distribution) => distribution.length > 1); + } + + /** + * Extends the correction-search process embodied by this SearchSpace by an extra + * input character, according to the characters' likelihood in the distribution. + * @param inputDistribution The fat-finger distribution for the incoming keystroke (or + * just the raw keystroke if corrections are disabled) + */ + addInput(inputDistribution: Distribution, bestProbFromSet: number) { + const input = inputDistribution; + this._inputSequence.push(input); + const lastDepthCost = this.lowestCostAtDepth[this.lowestCostAtDepth.length - 1] ?? 0; + const logTierCost = -Math.log(bestProbFromSet); + this.lowestCostAtDepth.push(lastDepthCost + logTierCost); + + // Assumes that `inputDistribution` is already sorted. + this.minInputCost.push(-Math.log(inputDistribution[0].p)); + + // With a newly-available input, we can extend new input-dependent paths from + // our previously-reached 'extractedResults' nodes. + let newlyAvailableEdges: SearchNode[] = []; + let batches = this.completedPaths.map(function(node) { + let deletions = node.buildDeletionEdges(inputDistribution); + let substitutions = node.buildSubstitutionEdges(inputDistribution); + + const batch = deletions.concat(substitutions); + + // Skip the queue for the first pass; there will ALWAYS be at least one pass, + // and queue-enqueing does come with a cost. Avoid the unnecessary overhead. 
+ return batch.flatMap(e => e.processSubsetEdge()); + }); + + // Don't forget to reset the array; the contained nodes no longer reach the search's end. + this.completedPaths = []; + this.returnedValues = {}; + + batches.forEach(function(batch) { + newlyAvailableEdges = newlyAvailableEdges.concat(batch); + }); + + this.selectionQueue.enqueueAll(newlyAvailableEdges); + } + + // TODO: will want eventually for reversions and/or backspaces + removeLastInput() { + // 1. truncate all entries from that search tier; we need to 'restore' extractedResults to match + // the state that would have existed without the last search tier. + // 2. remove the last search tier. Which may necessitate reconstructing the tier queue, but oh well. + } + + /** + * Indicates if the correction-search has another entry (and thus has not yet + * reached its end). + * @returns + */ + public hasNextMatchEntry(): boolean { + return this.selectionQueue.count > 0 && this.selectionQueue.peek().currentCost < Number.POSITIVE_INFINITY; + } + + public getCurrentCost(): number { + if(this.selectionQueue.count > 0) { + return this.selectionQueue.peek().currentCost; + } + + return Number.POSITIVE_INFINITY; + } + + /** + * Retrieves the lowest-cost / lowest-distance edge from the selection queue, + * checks its validity as a correction to the input text, and reports on what + * sort of result the edge's destination node represents. + * @returns + */ + handleNextNode(): PathResult { + if(!this.hasNextMatchEntry()) { + return { type: 'none' }; + } + + let currentNode = this.selectionQueue.dequeue(); + + let unmatchedResult: PathResult = { + type: 'intermediate', + cost: currentNode.currentCost + } + + // Have we already processed a matching edge? If so, skip it. + // We already know the previous edge is of lower cost. 
+ if(this.processedEdgeSet[currentNode.pathKey]) { + return unmatchedResult; + } else { + this.processedEdgeSet[currentNode.pathKey] = true; + } + + // Stage 1: filter out nodes/edges we want to prune + + // Forbid a raw edit-distance of greater than 2. + // Note: .knownCost is not scaled, while its contribution to .currentCost _is_ scaled. + let substitutionsOnly = false; + if(currentNode.editCount > 2) { + return unmatchedResult; + } else if(currentNode.editCount == 2) { + // Hard restriction: no further edits will be supported. This helps keep the search + // more narrowly focused. + substitutionsOnly = true; + } + + // Thresholds _any_ path, partially based on currently-traversed distance. + // Allows a little 'wiggle room' + 2 "hard" edits. + // Can be important if needed characters don't actually exist on the keyboard + // ... or even just not the then-current layer of the keyboard. + // + // TODO: still consider the lowest-cost individual edges for THIS specific criterion. + const tierMinCost = this.lowestCostAtDepth[currentNode.priorInput.length-1]; + if(currentNode.currentCost > tierMinCost + 2.5 * SearchQuotientSpur.EDIT_DISTANCE_COST_SCALE) { + return unmatchedResult; + } + + // Stage 2: process subset further OR build remaining edges + + if(currentNode.hasPartialInput) { + // Re-use the current queue; the number of total inputs considered still holds. + this.selectionQueue.enqueueAll(currentNode.processSubsetEdge()); + return unmatchedResult; + } + + // OK, we fully crossed a graph edge and have landed on a transition point; + // time to build more edges / edge batches. + + // Always possible, as this does not require any new input. + if(!substitutionsOnly) { + let insertionEdges = currentNode.buildInsertionEdges(); + this.selectionQueue.enqueueAll(insertionEdges); + } + + if(currentNode.calculation.inputSequence.length == this.inputSequence.length) { + // It was the final tier - store the node for future reference. 
+ this.completedPaths.push(currentNode); + + if((this.returnedValues[currentNode.resultKey]?.currentCost ?? Number.POSITIVE_INFINITY) > currentNode.currentCost) { + this.returnedValues[currentNode.resultKey] = currentNode; + } else { + // Not a better cost, so reject it and move on to the next potential result. + return this.handleNextNode(); + } + + return { + type: 'complete', + cost: currentNode.currentCost, + finalNode: currentNode + }; + } else { + // Time to construct new edges for the next tier! + let inputIndex = currentNode.calculation.inputSequence.length; + + let deletionEdges: SearchNode[] = []; + if(!substitutionsOnly) { + deletionEdges = currentNode.buildDeletionEdges(this._inputSequence[inputIndex]); + } + const substitutionEdges = currentNode.buildSubstitutionEdges(this._inputSequence[inputIndex]); + let batch = deletionEdges.concat(substitutionEdges); + + // Skip the queue for the first pass; there will ALWAYS be at least one pass, + // and queue-enqueing does come with a cost - avoid unnecessary overhead here. + batch = batch.flatMap(e => e.processSubsetEdge()); + + // Note: we're live-modifying the tier's cost here! The priority queue loses its guarantees as a result. + this.selectionQueue.enqueueAll(batch); + + // We didn't reach an end-node, so we just end the iteration and continue the search. + } + + // If we've somehow fully exhausted all search options, indicate that none remain. 
+ return unmatchedResult; + } +} \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts index 81c9fe8b291..4c8a6276f4e 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/model-compositor.ts @@ -7,6 +7,7 @@ import { applySuggestionCasing, correctAndEnumerate, dedupeSuggestions, finalize import { detectCurrentCasing, determineModelTokenizer, determineModelWordbreaker, determinePunctuationFromModel } from './model-helpers.js'; import { ContextTracker } from './correction/context-tracker.js'; +import { SearchQuotientSpur } from './correction/search-quotient-spur.js'; import CasingForm = LexicalModelTypes.CasingForm; import Configuration = LexicalModelTypes.Configuration; @@ -145,7 +146,7 @@ export class ModelCompositor { // Section 1: determine 'prediction roots' - enumerate corrections from most to least likely, // searching for results that yield viable predictions from the model. - const SEARCH_TIMEOUT = correction.SearchSpace.DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL; + const SEARCH_TIMEOUT = SearchQuotientSpur.DEFAULT_ALLOTTED_CORRECTION_TIME_INTERVAL; const timer = this.activeTimer = new correction.ExecutionTimer(this.testMode ? Number.MAX_VALUE : SEARCH_TIMEOUT, this.testMode ? 
Number.MAX_VALUE : SEARCH_TIMEOUT * 1.5); const { postContextState, rawPredictions, revertableTransitionId } = await correctAndEnumerate(this.contextTracker, this.lexicalModel, timer, transformDistribution, context); diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 30b53b149e0..f475bedf361 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -23,7 +23,7 @@ import Reversion = LexicalModelTypes.Reversion; import Suggestion = LexicalModelTypes.Suggestion; import SuggestionTag = LexicalModelTypes.SuggestionTag; import Transform = LexicalModelTypes.Transform; -import { ContextTransition } from './test-index.js'; +import { ContextTransition, getBestMatches } from './test-index.js'; /* * The functions in this file exist to provide unit-testable stateless components for the @@ -493,7 +493,7 @@ export async function correctAndEnumerate( let rawPredictions: CorrectionPredictionTuple[] = []; let bestCorrectionCost: number; const correctionPredictionMap: Record> = {}; - for await(const match of searchSpace.getBestMatches(timer)) { + for await(const match of getBestMatches(searchSpace, timer)) { // Corrections obtained: now to predict from them! 
const correction = match.matchString; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts index a65c99506e6..2c5c0214ada 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts @@ -4,6 +4,8 @@ export * from './correction/context-token.js'; export * from './correction/context-tokenization.js'; export { ContextTracker } from './correction/context-tracker.js'; export { ContextTransition } from './correction/context-transition.js'; +export * from './correction/distance-modeler.js'; +export * from './correction/search-quotient-spur.js'; export { ExtendedEditOperation, SegmentableDistanceCalculation } from './correction/segmentable-calculation.js'; export * from './correction/tokenization-subsets.js'; export * as correction from './correction/index.js'; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts index edcb0b94c7b..c6f657f6a34 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts @@ -14,7 +14,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { ContextToken, correction, models, preprocessInputSources } from '@keymanapp/lm-worker/test-index'; +import { ContextToken, correction, getBestMatches, models, preprocessInputSources } from '@keymanapp/lm-worker/test-index'; import Distribution = LexicalModelTypes.Distribution; import ExecutionTimer = correction.ExecutionTimer; @@ -59,7 +59,7 @@ 
describe('ContextToken', function() { assert.isFalse(token.isWhitespace); // While searchSpace has no inputs, it _can_ match lexicon entries (via insertions). - let searchIterator = token.searchSpace.getBestMatches(new ExecutionTimer(Number.POSITIVE_INFINITY, Number.POSITIVE_INFINITY)); + let searchIterator = getBestMatches(token.searchSpace, new ExecutionTimer(Number.POSITIVE_INFINITY, Number.POSITIVE_INFINITY)); let firstEntry = await searchIterator.next(); assert.isFalse(firstEntry.done); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts index 7e1657e77bc..c72917bed73 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/distance-modeler.tests.ts @@ -13,12 +13,11 @@ import { PriorityQueue } from '@keymanapp/web-utils'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { correction, models } from '@keymanapp/lm-worker/test-index'; +import { correction, models, QUEUE_NODE_COMPARATOR } from '@keymanapp/lm-worker/test-index'; import SENTINEL_CODE_UNIT = models.SENTINEL_CODE_UNIT; import Distribution = LexicalModelTypes.Distribution; import SearchNode = correction.SearchNode; -import SearchResult = correction.SearchResult; import Transform = LexicalModelTypes.Transform; import TrieModel = models.TrieModel; @@ -34,10 +33,6 @@ const toKey = (s: string) => testModel.toKey(s); */ const FIRST_CHAR_VARIANTS = 24; -function buildTestTimer() { - return new correction.ExecutionTimer(Number.MAX_VALUE, Number.MAX_VALUE); -} - function assertEdgeChars(edge: correction.SearchNode, input: string, match: string) { assert.isTrue(edgeHasChars(edge, 
input, match)); } @@ -116,8 +111,8 @@ function fetchCommonTENode() { return teNode; } -describe('Correction Distance Modeler', function() { - describe('SearchNode', function() { +describe('Correction Distance Modeler', () => { + describe('SearchNode', () => { it('constructs a fresh instance from a traversal + keyingFunction', () => { const rootNode = new SearchNode(testModel.traverseFromRoot(), toKey); assert.equal(rootNode.resultKey, ''); @@ -1085,7 +1080,7 @@ describe('Correction Distance Modeler', function() { let layer1Edges = rootNode.buildSubstitutionEdges(synthDistribution1) // No 2+ inserts here; we're fine with just one call. .flatMap(e => e.processSubsetEdge()); - let layer1Queue = new PriorityQueue(correction.QUEUE_NODE_COMPARATOR, layer1Edges); + const layer1Queue = new PriorityQueue(QUEUE_NODE_COMPARATOR, layer1Edges); let tEdge = layer1Queue.dequeue(); assertEdgeChars(tEdge, 't', 't'); @@ -1093,7 +1088,7 @@ describe('Correction Distance Modeler', function() { let layer2Edges = tEdge.buildSubstitutionEdges(synthDistribution2) // No 2+ inserts here; we're fine with just one call. .flatMap(e => e.processSubsetEdge()); - let layer2Queue = new PriorityQueue(correction.QUEUE_NODE_COMPARATOR, layer2Edges); + const layer2Queue = new PriorityQueue(QUEUE_NODE_COMPARATOR, layer2Edges); let eEdge = layer2Queue.dequeue(); assertEdgeChars(eEdge, 'e', 'e'); @@ -1114,7 +1109,7 @@ describe('Correction Distance Modeler', function() { .flatMap(e => e.processSubsetEdge()); let layer3ehEdges = ehEdge.buildSubstitutionEdges(synthDistribution3) .flatMap(e => e.processSubsetEdge()); - let layer3Queue = new PriorityQueue(correction.QUEUE_NODE_COMPARATOR, layer3eEdges.concat(layer3hEdges).concat(layer3ehEdges)); + const layer3Queue = new PriorityQueue(QUEUE_NODE_COMPARATOR, layer3eEdges.concat(layer3hEdges).concat(layer3ehEdges)); // Find the first result with an actual word directly represented. 
let bestEdge; @@ -1149,184 +1144,4 @@ describe('Correction Distance Modeler', function() { assert.isAbove(sibling2.currentCost, sibling1.currentCost); }); }); - - describe('SearchSpaceTier + SearchSpace', function() { - let checkRepeatableResults_teh = async function(iter: AsyncGenerator) { - let firstIterResult = await iter.next(); // {value: , done: } - assert.isFalse(firstIterResult.done); - - const firstResult: correction.SearchResult = firstIterResult.value; // Retrieves - // No checks on the first set's cost. - assert.equal(firstResult.matchString, "ten"); - - // All start with 'te' but one, and invoke one edit of the same cost. - // 'th' has an 'h' at the same cost (input 3) of the 'e' (input 2). - let secondBatch = [ - 'tec', 'tel', 'tem', - 'ter', 'tes', 'th', - 'te' - ]; - - async function checkBatch(batch: string[], prevCost: number) { - let cost; - while(batch.length > 0) { - const iter_result = await iter.next(); - assert.isFalse(iter_result.done); - - const result = iter_result.value; - assert.isAbove(result.totalCost, prevCost); - if(cost !== undefined) { - assert.equal(result.totalCost, cost); - } else { - cost = result.totalCost; - } - - const matchIndex = batch.findIndex((entry) => entry == result.matchString); - assert.notEqual(matchIndex, -1, `'${result.matchString}' received as prediction too early`); - batch.splice(matchIndex, 1); - } - - return cost; - } - - const secondCost = await checkBatch(secondBatch, firstResult.totalCost); - - // Single hard edit, all other input probability aspects are equal - let thirdBatch = [ - // 't' -> 'b' (sub) - 'beh', - // '' -> 'c' (insertion) - 'tech' - ]; - - await checkBatch(thirdBatch, secondCost); - - // All replace the low-likelihood case for the third input. - let fourthBatch = [ - 'the', 'thi', 'tho', 'thr', - 'thu', 'tha' - ]; - - await checkBatch(fourthBatch, secondCost); - - // Replace the _first_ input's char OR insert an extra char, - // also matching the low-likelihood third-char option. 
- let fifthBatch = [ - 'cen', 'en', 'gen', - 'ken', 'len', 'men', - 'sen', 'then', 'wen' - ]; - - await checkBatch(fifthBatch, secondCost); - } - - it('Simple search without input', async function() { - // The combinatorial effect here is a bit much to fully test. - let rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); - - let searchSpace = new correction.SearchSpace(testModel); - - let iter = searchSpace.getBestMatches(buildTestTimer()); - let firstResult = await iter.next(); - assert.isFalse(firstResult.done); - }); - - // Hmm... how best to update this... - it('Simple search (paralleling "Small integration test")', async function() { - // The combinatorial effect here is a bit much to fully test. - let rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); - - let searchSpace = new correction.SearchSpace(testModel); - - // VERY artificial distributions. - let synthDistribution1 = [ - {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability - ]; - - let synthDistribution2 = [ - {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} - ]; - - let synthDistribution3 = [ - {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} - ]; - - searchSpace.addInput(synthDistribution1, 1); - searchSpace.addInput(synthDistribution2, .75); - searchSpace.addInput(synthDistribution3, .25); - - let iter = searchSpace.getBestMatches(buildTestTimer()); // disables the correction-search timeout. - await checkRepeatableResults_teh(iter); - }); - - it('Allows reiteration (sequentially)', async function() { - // The combinatorial effect here is a bit much to fully test. - let rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); - - let searchSpace = new correction.SearchSpace(testModel); - - // VERY artificial distributions. 
- let synthDistribution1 = [ - {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability - ]; - - let synthDistribution2 = [ - {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} - ]; - - let synthDistribution3 = [ - {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability - {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} - ]; - - searchSpace.addInput(synthDistribution1, 1); - searchSpace.addInput(synthDistribution2, .75); - searchSpace.addInput(synthDistribution3, .25); - - let iter = searchSpace.getBestMatches(buildTestTimer()); // disables the correction-search timeout. - await checkRepeatableResults_teh(iter); - - // The key: do we get the same results the second time? - // Reset the iterator first... - let iter2 = searchSpace.getBestMatches(buildTestTimer()); // disables the correction-search timeout. - await checkRepeatableResults_teh(iter2); - }); - - it('Empty search space, loaded model', async function() { - // The combinatorial effect here is a bit much to fully test. - let rootTraversal = testModel.traverseFromRoot(); - assert.isNotEmpty(rootTraversal); - - let searchSpace = new correction.SearchSpace(testModel); - const timer = buildTestTimer(); - let iter = searchSpace.getBestMatches(timer); - - // While there's no input, insertion operations can produce suggestions. - let resultState = await iter.next(); - let result: SearchResult = resultState.value; - - // Just one suggestion root should be returned as the first result. - assert.equal(result.totalCost, 0); // Gives a perfect match - assert.equal(result.inputSequence.length, 0); // for a state with no input and - assert.equal(result.matchString, ''); // an empty match string. - assert.isFalse(resultState.done); - - // Should be able to reach more, though. 
- let laterResultState = await iter.next(); - let laterResult: SearchResult = laterResultState.value; - - // Edit required: an 'insertion' edge (no input matched, but char pulled - // from lexicon) - assert.isAbove(laterResult.totalCost, 0); - // The most likely word in the lexicon starts with 't'. - assert.equal(laterResult.matchString, 't'); - assert.isFalse(resultState.done); - }); - }); }); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/getBestMatches.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/getBestMatches.tests.ts new file mode 100644 index 00000000000..6c07ea9ed4e --- /dev/null +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/getBestMatches.tests.ts @@ -0,0 +1,203 @@ +/* + * Keyman is copyright (C) SIL Global. MIT License. + * + * Created by jahorton on 2025-10-09 + * + * This file defines tests for the correction-searching process of the + * predictive-text correction-search engine. + */ + +import { assert } from 'chai'; + +import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; +import { correction, getBestMatches, models, SearchQuotientSpur } from '@keymanapp/lm-worker/test-index'; + +import SearchResult = correction.SearchResult; +import TrieModel = models.TrieModel; + +const testModel = new TrieModel(jsonFixture('models/tries/english-1000')); + +function buildTestTimer() { + return new correction.ExecutionTimer(Number.MAX_VALUE, Number.MAX_VALUE); +} + +describe('getBestMatches', () => { + const checkRepeatableResults_teh = async (iter: AsyncGenerator) => { + const firstIterResult = await iter.next(); // {value: , done: } + assert.isFalse(firstIterResult.done); + + const firstResult: correction.SearchResult = firstIterResult.value; // Retrieves + // No checks on the first set's cost. 
+ assert.equal(firstResult.matchString, "ten"); + + // All start with 'te' but one, and invoke one edit of the same cost. + // 'th' has an 'h' at the same cost (input 3) of the 'e' (input 2). + const secondBatch = [ + 'tec', 'tel', 'tem', + 'ter', 'tes', 'th', + 'te' + ]; + + async function checkBatch(batch: string[], prevCost: number) { + let cost; + while(batch.length > 0) { + const iter_result = await iter.next(); + assert.isFalse(iter_result.done); + + const result = iter_result.value; + assert.isAbove(result.totalCost, prevCost); + if(cost !== undefined) { + assert.equal(result.totalCost, cost); + } else { + cost = result.totalCost; + } + + const matchIndex = batch.findIndex((entry) => entry == result.matchString); + assert.notEqual(matchIndex, -1, `'${result.matchString}' received as prediction too early`); + batch.splice(matchIndex, 1); + } + + return cost; + } + + const secondCost = await checkBatch(secondBatch, firstResult.totalCost); + + // Single hard edit, all other input probability aspects are equal + const thirdBatch = [ + // 't' -> 'b' (sub) + 'beh', + // '' -> 'c' (insertion) + 'tech' + ]; + + await checkBatch(thirdBatch, secondCost); + + // All replace the low-likelihood case for the third input. + const fourthBatch = [ + 'the', 'thi', 'tho', 'thr', + 'thu', 'tha' + ]; + + await checkBatch(fourthBatch, secondCost); + + // Replace the _first_ input's char OR insert an extra char, + // also matching the low-likelihood third-char option. + const fifthBatch = [ + 'cen', 'en', 'gen', + 'ken', 'len', 'men', + 'sen', 'then', 'wen' + ]; + + await checkBatch(fifthBatch, secondCost); + } + + it('Simple search without input', async () => { + // The combinatorial effect here is a bit much to fully test. 
+ const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchSpace = new SearchQuotientSpur(testModel); + + const iter = getBestMatches(searchSpace, buildTestTimer()); + const firstResult = await iter.next(); + assert.isFalse(firstResult.done); + }); + + // Hmm... how best to update this... + it('Simple search (paralleling "Small integration test")', async () => { + // The combinatorial effect here is a bit much to fully test. + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchSpace = new SearchQuotientSpur(testModel); + + // VERY artificial distributions. + const synthInput1 = [ + {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability + ]; + + const synthInput2 = [ + {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} + ]; + + const synthInput3 = [ + {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} + ]; + + searchSpace.addInput(synthInput1, 1); + searchSpace.addInput(synthInput2, .75); + searchSpace.addInput(synthInput3, .75); + + const iter = getBestMatches(searchSpace, buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter); + }); + + it('Allows reiteration (sequentially)', async () => { + // The combinatorial effect here is a bit much to fully test. + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchSpace = new SearchQuotientSpur(testModel); + + + // VERY artificial distributions. 
+ const synthInput1 = [ + {sample: {insert: 't', deleteLeft: 0}, p: 1} // Transform, probability + ]; + + const synthInput2 = [ + {sample: {insert: 'e', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'h', deleteLeft: 0}, p: 0.25} + ]; + + const synthInput3 = [ + {sample: {insert: 'h', deleteLeft: 0}, p: 0.75}, // Transform, probability + {sample: {insert: 'n', deleteLeft: 0}, p: 0.25} + ]; + + searchSpace.addInput(synthInput1, 1); + searchSpace.addInput(synthInput2, .75); + searchSpace.addInput(synthInput3, .75); + + const iter = getBestMatches(searchSpace, buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter); + + // The key: do we get the same results the second time? + // Reset the iterator first... + const iter2 = getBestMatches(searchSpace, buildTestTimer()); // disables the correction-search timeout. + await checkRepeatableResults_teh(iter2); + }); + + it('Empty search space, loaded model', async () => { + // The combinatorial effect here is a bit much to fully test. + const rootTraversal = testModel.traverseFromRoot(); + assert.isNotEmpty(rootTraversal); + + const searchSpace = new SearchQuotientSpur(testModel); + const timer = buildTestTimer(); + const iter = getBestMatches(searchSpace, timer); + + // While there's no input, insertion operations can produce suggestions. + const resultState = await iter.next(); + const result: SearchResult = resultState.value; + + // Just one suggestion root should be returned as the first result. + assert.equal(result.totalCost, 0); // Gives a perfect match + assert.equal(result.inputSequence.length, 0); // for a state with no input and + assert.equal(result.matchString, ''); // an empty match string. + assert.isFalse(resultState.done); + + // Should be able to reach more, though. 
+    const laterResultState = await iter.next();
+    const laterResult: SearchResult = laterResultState.value;
+
+    // Edit required: an 'insertion' edge (no input matched, but char pulled
+    // from lexicon)
+    assert.isAbove(laterResult.totalCost, 0);
+    // The most likely word in the lexicon starts with 't'.
+    assert.equal(laterResult.matchString, 't');
+    assert.isFalse(laterResultState.done);
+  });
+});