diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
index cdb4688246a..87a9970d191 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-token.ts
@@ -7,7 +7,6 @@
  * in the context and associated correction-search progress and results.
  */
 
-import { buildMergedTransform } from "@keymanapp/models-templates";
 import { LexicalModelTypes } from '@keymanapp/common-types';
 import { deepCopy, KMWString } from "@keymanapp/web-utils";
 
@@ -182,59 +181,19 @@ export class ContextToken {
    * @param lexicalModel
    * @returns
    */
-  static merge(tokensToMerge: ContextToken[], lexicalModel: LexicalModel): ContextToken {
+  static merge(tokensToMerge: ContextToken[]): ContextToken {
+    if(tokensToMerge.length < 1) {
+      return null;
+    }
+
     // Assumption: if we're merging a token, it's not whitespace.
     // Thus, we don't set the .isWhitespace flag field.
-    const resultToken = new ContextToken(lexicalModel);
-
-    let lastSourceInput: PathInputProperties;
-    let lastInputDistrib: Distribution;
-    for(const token of tokensToMerge) {
-      const inputCount = token.inputCount;
-      let startIndex = 0;
-
-      if(inputCount == 0) {
-        continue;
-      }
-
-      // Are we re-merging on a previously split transform?
-      if(lastSourceInput?.segment.trueTransform != token.inputSegments[0].segment.trueTransform) {
-        if(lastSourceInput) {
-          resultToken.addInput(lastSourceInput, lastInputDistrib);
-        } // else: there's nothing to add as input
-      } else {
-        // If so, re-merge it!
-        startIndex++;
-
-        lastInputDistrib = lastInputDistrib?.map((entry, index) => {
-          return {
-            sample: buildMergedTransform(entry.sample, token.searchModule.inputSequence[0][index].sample),
-            p: entry.p
-          }
-        });
-
-        // In case there's only one input that needs merging on both ends.
-        if(inputCount == 1) {
-          // There's potential that the next incoming token needs to merge with this.
-          continue;
-        } else {
-          resultToken.addInput(lastSourceInput, lastInputDistrib);
-        }
-      }
-      lastSourceInput = null;
-      lastInputDistrib = null;
-
-      // Ignore the last entry for now - it may need to merge with a matching
-      // entry in the next token!
-      for(let i = startIndex; i < inputCount - 1; i++) {
-        resultToken.addInput(token.inputSegments[i], token.searchModule.inputSequence[i]);
-      }
-      lastSourceInput = token.inputSegments[inputCount-1];
-      lastInputDistrib = token.searchModule.inputSequence[inputCount-1];
+    const resultToken = new ContextToken(tokensToMerge.shift());
+    while(tokensToMerge.length > 0) {
+      const next = tokensToMerge.shift();
+      resultToken._searchModule = resultToken._searchModule.merge(next._searchModule);
     }
-    resultToken.addInput(lastSourceInput, lastInputDistrib);
-
     return resultToken;
   }
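With the recombination logic delegated to the search module, `ContextToken.merge` reduces to folding each token's `_searchModule` into a copy of the first token. A minimal usage sketch, mirroring the updated unit tests further below (`plainModel` is the tests' fixture model; note that `merge()` consumes its argument array via `shift()`, so pass a copy if the originals are still needed):

    // Sketch only - merging three tokens into one, as in the "can't" test case.
    const token1 = new ContextToken(plainModel, "can");
    const token2 = new ContextToken(plainModel, "'");
    const token3 = new ContextToken(plainModel, "t");

    const merged = ContextToken.merge([token1, token2, token3]);
    // merged.exampleInput == "can't" - no lexicalModel argument is needed anymore.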
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
index 2ace532847f..89857fecc6a 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
@@ -526,7 +526,7 @@ export class ContextTokenization {
         // consider: move to ContextToken as class method. (static?)
         const merge = merges.shift();
         const tokensToMerge = merge.inputs.map((m) => baseTokenization[m.index]);
-        const mergeResult = ContextToken.merge(tokensToMerge, lexicalModel);
+        const mergeResult = ContextToken.merge(tokensToMerge);
         tokenization.push(mergeResult);
         i = merge.inputs[merge.inputs.length - 1].index;
         continue;
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/legacy-quotient-spur.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/legacy-quotient-spur.ts
index 71dfc87d9ee..bcc3f1893fc 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/legacy-quotient-spur.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/legacy-quotient-spur.ts
@@ -44,7 +44,7 @@ export class LegacyQuotientSpur extends SearchQuotientSpur {
     return;
   }
 
-  protected construct(parentNode: SearchQuotientNode, inputs?: Distribution, inputSource?: PathInputProperties): this {
+  construct(parentNode: SearchQuotientNode, inputs?: Distribution, inputSource?: PathInputProperties): this {
     return new LegacyQuotientSpur(parentNode, inputs, inputSource) as this;
   }
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts
index 7ee2cc23cb9..472df52fea8 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-node.ts
@@ -3,7 +3,7 @@
 *
 * Created by jahorton on 2025-10-09
 *
- * This file defines the predictive-text engine's SearchSpace class, which is used to
+ * This file defines the predictive-text engine's SearchQuotientNode class, which is used to
 * manage the search-space(s) for text corrections within the engine.
 */
 
@@ -179,7 +179,7 @@ export interface SearchQuotientNode {
   readonly inputCount: number;
 
   /**
-   * Retrieves the sequence of inputs that led to this SearchSpace.
+   * Retrieves the sequence of inputs that led to this SearchQuotientNode.
    *
    * THIS WILL BE REMOVED SHORTLY in favor of `constituentPaths` below, which
    * provides an improved view into the data and models multiple paths to the
@@ -215,7 +215,16 @@ export interface SearchQuotientNode {
   get sourceRangeKey(): string;
 
   /**
-   * Splits this SearchSpace into two halves at the specified codepoint index.
+   * Appends the provided SearchQuotientNode's search properties to this
+   * SearchQuotientNode, extending the represented search range accordingly.
+   * If this operation reverses a previous .split() call, the two halves of
+   * any split input components will be fully re-merged.
+   * @param space
+   */
+  merge(space: SearchQuotientNode): SearchQuotientNode;
+
+  /**
+   * Splits this SearchQuotientNode into two halves at the specified codepoint index.
    * The 'head' component will maximally re-use existing cached data, while the
    * 'tail' must be reconstructed from scratch due to the new start position.
    * @param charIndex
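Since `merge()` is documented as the inverse of `split()`, the interface contract can be summarized as a small round-trip sketch (hypothetical `node` value, not part of this changeset):

    // Splitting at a codepoint index and then merging should restore the range.
    const [head, tail] = node.split(2);
    const rejoined = head.merge(tail);
    // `rejoined` covers the same source range, with split transforms re-fused.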
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-root.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-root.ts
index 151f5c0f4bc..6b72ef22316 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-root.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-root.ts
@@ -3,6 +3,7 @@ import { LexicalModelTypes } from '@keymanapp/common-types';
 
 import { SearchNode, SearchResult } from './distance-modeler.js';
 import { generateSpaceSeed, PathInputProperties, PathResult, SearchQuotientNode } from './search-quotient-node.js';
+import { SearchQuotientSpur } from './search-quotient-spur.js';
 
 import LexicalModel = LexicalModelTypes.LexicalModel;
 
@@ -105,4 +106,31 @@ export class SearchQuotientRoot implements SearchQuotientNode {
   split(charIndex: number): [SearchQuotientNode, SearchQuotientNode] {
     return [this, new SearchQuotientRoot(this.model)];
   }
+
+  merge(space: SearchQuotientNode): SearchQuotientNode {
+    // Head node for the incoming path is empty, so skip it.
+    if(space.parents.length == 0 || space instanceof SearchQuotientRoot) {
+      return this;
+    }
+
+    // Merge any parents first as a baseline; our contribution must come after
+    // their effects are merged in, anyway.
+    const parentMerges = space.parents?.length > 0 ? space.parents.map((p) => this.merge(p)) : [this];
+
+    // If parentMerges.length > 1, the result is a SearchCluster.
+    // const parentMerge = parentMerges.length > 1 ? new SearchCluster(parentMerges) : parentMerges[0];
+    const parentMerge = parentMerges[0];
+
+    // Special case: if we've reached the head of the space to be merged, check
+    // for a split transform.
+    // - we return `this` from the root, so if that's what we received, we're
+    //   on the first descendant - the first path component.
+    if(space instanceof SearchQuotientSpur) {
+      // Needs to construct a NEW instance of the same derived type, based on this root.
+      return space.construct(parentMerge, space.inputs, space.inputSource);
+    } else {
+      // If the parent was a cluster, the cluster itself is the merge.
+      return parentMerge;
+    }
+  }
 }
\ No newline at end of file
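For review purposes, a sketch of how this recursion unrolls when grafting a two-spur chain onto a different root (hypothetical names, assuming single-parent spurs):

    // rootA ← spur1 ← spur2, merged onto rootB via rootB.merge(spur2):
    //   rootB.merge(spur2)
    //     → rootB.merge(spur1)
    //         → rootB.merge(rootA) returns rootB itself
    //         → spur1' = spur1.construct(rootB, spur1.inputs, spur1.inputSource)
    //     → spur2' = spur2.construct(spur1', spur2.inputs, spur2.inputSource)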
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts
index 449f8ff3c99..dde8ed35294 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-quotient-spur.ts
@@ -10,10 +10,12 @@
 
 import { QueueComparator as Comparator, KMWString, PriorityQueue } from '@keymanapp/web-utils';
 import { LexicalModelTypes } from '@keymanapp/common-types';
+import { buildMergedTransform } from '@keymanapp/models-templates';
 
 import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js';
 import { generateSpaceSeed, PathResult, SearchQuotientNode, PathInputProperties } from './search-quotient-node.js';
 import { generateSubsetId } from './tokenization-subsets.js';
+import { SearchQuotientRoot } from './search-quotient-root.js';
 import { LegacyQuotientRoot } from './legacy-quotient-root.js';
 
 import Distribution = LexicalModelTypes.Distribution;
@@ -155,13 +157,102 @@ export abstract class SearchQuotientSpur implements SearchQuotientNode {
     this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, entries);
   }
 
-  /** Allows the base class to construct instances of the derived class. */
-  protected abstract construct(
+  /**
+   * Allows construction of new spur instances matching this spur's edge type.
+   *
+   * Target use cases:
+   * - `SearchQuotientNode.split()`
+   *   - an edge may need to be split into two parts
+   *   - edges may need to be recreated on a shortened search path (for the
+   *     split's right-hand side)
+   * - `SearchQuotientNode.merge()`
+   *   - two parts may need to be recombined into a single edge
+   *   - edges from the 'right-hand side' may need to be recreated on the
+   *     left-hand side for the merged quotient path
+   * @param parentNode
+   * @param inputs
+   * @param inputSource
+   */
+  abstract construct(
     parentNode: SearchQuotientNode,
-    inputs?: Distribution,
-    inputSource?: PathInputProperties
+    inputs: Distribution,
+    inputSource: PathInputProperties
   ): this;
 
+  // Spaces are in sequence here; `this` is the head 'space'.
+  public merge(space: SearchQuotientNode): SearchQuotientNode {
+    // Head node for the incoming path is empty, so skip it.
+    if(space.parents.length == 0 || space instanceof SearchQuotientRoot) {
+      return this;
+    }
+
+    // Merge any parents first as a baseline; our contribution must come after
+    // their effects are merged in, anyway.
+    const parentMerges = space.parents?.length > 0 ? space.parents.map((p) => this.merge(p)) : [this];
+
+    // If parentMerges.length > 1, the result is a SearchCluster.
+    const parentMerge = parentMerges[0];
+
+    // Special case: if we've reached the head of the space to be merged, check
+    // for a split transform.
+    // - we return `this` from the root, so if that's what we received, we're
+    //   on the first descendant - the first path component.
+    if(space instanceof SearchQuotientSpur) {
+      if(parentMerge != this) {
+        // Here, we reconstruct the child `space` on a new root. The new
+        // instance needs to be of the same type as the original instance.
+        return space.construct(parentMerge, space.inputs, space.inputSource);
+      }
+
+      const localInputId = this.inputSource?.segment.transitionId;
+      const spaceInputId = space.inputSource?.segment.transitionId;
+      // The 'id' may be undefined in some unit tests and for tokens
+      // reconstructed after a backspace. In either case, we consider the
+      // related results as fully separate; our reconstructions are
+      // per-codepoint.
+      if(localInputId != spaceInputId || localInputId === undefined) {
+        return space.construct(parentMerge, space.inputs, space.inputSource);
+      }
+
+      // Get the twin halves that were split.
+      // Assumption: the two halves are in their original order, etc.
+      const localInputs = this.inputs;
+      const spaceInputs = space.inputs;
+
+      // Sanity check - ensure that the input distributions have the same length;
+      // if not, this can't represent a previously-split SearchQuotientSpur!
+      if(localInputs.length != spaceInputs.length) {
+        return space.construct(parentMerge, space.inputs, space.inputSource);
+      }
+
+      // Merge them!
+      const mergedInputs = localInputs?.map((entry, index) => {
+        return {
+          sample: buildMergedTransform(entry.sample, spaceInputs[index].sample),
+          p: entry.p
+        }
+      });
+
+      const mergedInputSource = {
+        ...this.inputSource,
+        segment: {
+          ...this.inputSource.segment,
+          end: space.inputSource.segment.end
+        }
+      };
+
+      if(mergedInputSource.segment.end == undefined) {
+        delete mergedInputSource.segment.end;
+      }
+
+      // Now to re-merge the two halves.
+      return space.construct(this.parentNode, mergedInputs, mergedInputSource);
+    } else {
+      // If the parent was a cluster, the cluster itself is the merge.
+      return parentMerge;
+    }
+  }
+
   public split(charIndex: number): [SearchQuotientNode, SearchQuotientNode] {
     const internalSplitIndex = charIndex - (this.codepointLength - this.insertLength);
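The `buildMergedTransform` call above is what re-fuses a transform that an earlier `split()` cut in two. Using the values from the final unit test below as a concrete check:

    // The head half retains the original deleteLeft; the tail half carries none.
    const head = { insert: 'fo', deleteLeft: 1, deleteRight: 0, id: 42 };
    const tail = { insert: 'ur', deleteLeft: 0, deleteRight: 0, id: 42 };
    // buildMergedTransform(head, tail) applies `tail` atop `head`, yielding
    // { insert: 'four', deleteLeft: 1, deleteRight: 0, id: 42 }.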
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
index f78d9665c06..cdb4bb7b543 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
@@ -107,7 +107,7 @@ describe('ContextToken', function() {
     const token2 = new ContextToken(plainModel, "'");
     const token3 = new ContextToken(plainModel, "t");
 
-    const merged = ContextToken.merge([token1, token2, token3], plainModel);
+    const merged = ContextToken.merge([token1, token2, token3]);
     assert.equal(merged.exampleInput, "can't");
     token1.inputSegments.forEach((entry) => assert.isTrue(merged.inputSegments.indexOf(entry) > -1));
     token2.inputSegments.forEach((entry) => assert.isTrue(merged.inputSegments.indexOf(entry) > -1));
@@ -161,7 +161,7 @@ describe('ContextToken', function() {
       subsetId: srcSubsetId
     }, [{sample: {insert: 't', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]);
 
-    const merged = ContextToken.merge([token1, token2, token3], plainModel);
+    const merged = ContextToken.merge([token1, token2, token3]);
     assert.equal(merged.exampleInput, "can't");
     assert.deepEqual(merged.inputSegments, [
       { segment: {
@@ -259,7 +259,7 @@ describe('ContextToken', function() {
       subsetId: srcSubsetIds[3]
     }, [{sample: srcTransforms[3], p: 1}]);
 
-    const merged = ContextToken.merge(tokensToMerge, plainModel);
+    const merged = ContextToken.merge(tokensToMerge);
     assert.equal(merged.exampleInput, "applesandsourgrapes");
     assert.deepEqual(merged.inputSegments, srcTransforms.map((t, i) => ({
       segment: {
@@ -359,7 +359,7 @@ describe('ContextToken', function() {
       subsetId: srcSubsetIds[3]
     }, [{sample: srcTransforms[3], p: 1}]);
 
-    const merged = ContextToken.merge(tokensToMerge, plainModel);
+    const merged = ContextToken.merge(tokensToMerge);
     assert.equal(merged.exampleInput, toMathematicalSMP("applesandsourgrapes"));
     assert.deepEqual(merged.inputSegments, srcTransforms.map((t, i) => ({
       segment: {
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts
index 9ebb1f8cfc7..56440b418bf 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-quotient-spur.tests.ts
@@ -3,16 +3,29 @@
 *
 * Created by jahorton on 2025-10-29
 *
- * This file defines tests for the SearchSpace class of the
+ * This file defines tests for the SearchQuotientSpur classes of the
 * predictive-text correction-search engine.
 */
 
 import { assert } from 'chai';
 
+import { LexicalModelTypes } from '@keymanapp/common-types';
 import { KMWString } from '@keymanapp/web-utils';
 import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';
-import { LegacyQuotientSpur, models, LegacyQuotientRoot, unitTestEndpoints, SearchQuotientNode, SearchQuotientSpur, SearchQuotientRoot } from '@keymanapp/lm-worker/test-index';
-
+import {
+  generateSubsetId,
+  LegacyQuotientRoot,
+  LegacyQuotientSpur,
+  models,
+  PathInputProperties,
+  SearchQuotientNode,
+  SearchQuotientRoot,
+  SearchQuotientSpur,
+  unitTestEndpoints
+} from '@keymanapp/lm-worker/test-index';
+
+import Distribution = LexicalModelTypes.Distribution;
+import Transform = LexicalModelTypes.Transform;
 import TrieModel = models.TrieModel;
 
 const { constituentPaths, quotientPathHasInputs } = unitTestEndpoints;
@@ -320,7 +333,7 @@ describe('SearchQuotientSpur', () => {
       assert.sameOrderedMembers(pathSequence, paths.slice(1));
     });
 
-    // TODO: add a test for mixed SearchPath / SearchCluster cases.
+    // TODO: add a test for mixed SearchQuotientSpur / SearchCluster cases.
   });
 
   describe('split()', () => {
@@ -1336,4 +1349,575 @@ describe('SearchQuotientSpur', () => {
       assert.deepEqual((tail as LegacyQuotientSpur).inputSource, tailTarget.inputSource);
     });
   });
+
+  // Placed after `split()` because many cases mock a reversal of split-test results.
+  describe('merge()', () => {
+    /*
+     * To define:
+     * - merging a standard case
+     * - merging a split BMP case
+     * - merging a standard SMP case
+     * - merging a split SMP case
+     * - merging a case where the deleteLeft was split from the insert
+     *   - splitIndex = 0, but the deleteLeft is (conceptually) before that.
+     *   - this (empty) + param (full)
+     *   - this (full) + param (empty)
+     * - merging with distributions (no split)
+     * - merging with distributions (and a definite split)
+     *
+     * - biglargetransform for single-input multi-split remerge
+     * - merging a three-way split should be associative (not dependent on order) so
+     *   long as the relative positions are correct
+     *
+     * - "cello" case(s) covers...
+     *   - deleteLeft split from insert
+     *   - a straight-up split (mid-insert)
+     *   - standard case (no distrib)
+     *   - with head + tail index inclusion, the empty + full versions
+     *   - SMP variant: the SMP cases.
+     *
+     * - then we may need a "merging with distributions" coverage
+     *   - can probably make a simple BMP mocked version...
+     *   - and a simple SMP mocked version
+     *   - is actually pretty much covered anyway... I believe.
+     */
+
+    // Covers cases where a single "input" was split into more than two fragments.
+    describe(`previously-split token comprised of single titanic transform: biglargetransform`, () => {
+      const buildPath = () => {
+        const distributions = [
+          [{ sample: {insert: 'big', deleteLeft: 0, id: 11}, p: 1 }],
+          [{ sample: {insert: 'large', deleteLeft: 0, id: 11}, p: 1 }],
+          [{ sample: {insert: 'transform', deleteLeft: 0, id: 11}, p: 1 }]
+        ];
+
+        const originalInputBase: PathInputProperties = {
+          segment: {
+            trueTransform: {insert: 'biglargetransform', deleteLeft: 0, id: 11},
+            start: 0,
+            transitionId: 11
+          },
+          bestProbFromSet: 1,
+          subsetId: generateSubsetId()
+        };
+
+        const splitOriginalInputs = [0, 3, 8].map(n => ({
+          ...originalInputBase,
+          segment: {
+            ...originalInputBase.segment,
+            start: n
+          }
+        }));
+        splitOriginalInputs[0].segment.end = 3;
+        splitOriginalInputs[1].segment.end = 8;
+
+        const paths = distributions.map((d, i) => new LegacyQuotientSpur(new LegacyQuotientRoot(testModel), d, splitOriginalInputs[i]));
+
+        return {
+          paths,
+          distributions,
+          splitOriginalInputs,
+          originalInput: originalInputBase
+        };
+      }
+
+      const checkFinalStateAssertions = (merged: SearchQuotientSpur, originalInput: PathInputProperties) => {
+        assert.equal(merged.inputCount, 1);
+        assert.isTrue(merged instanceof SearchQuotientSpur);
+        assert.deepEqual(merged.bestExample.text, "biglargetransform");
+        assert.deepEqual((merged as SearchQuotientSpur).inputs, [
+          { sample: { insert: 'biglargetransform', deleteLeft: 0, id: 11 }, p: 1 }
+        ]);
+        assert.deepEqual((merged as SearchQuotientSpur).inputSource, originalInput);
+        // TODO: check the 'source' input data (here and in callers)
+      }
+
+      it('setup: constructs paths properly', () => {
+        const { paths, distributions, splitOriginalInputs: originalInputs } = buildPath();
+
+        assert.equal(paths.length, 3);
+        assert.equal(distributions.length, paths.length);
+        paths.forEach((p, i) => {
+          assert.equal(p.inputCount, 1);
+          assert.equal(distributions[i].length, p.inputCount);
+          assert.equal(p.codepointLength, KMWString.length(distributions[i][0].sample.insert));
+          assert.deepEqual(p.bestExample, {
+            text: ['big', 'large', 'transform'][i],
+            p: 1
+          });
+          assert.equal(p.parents[0].inputCount, 0);
+          assert.isTrue(quotientPathHasInputs(p, [distributions[i]]));
+        });
+
+        originalInputs.forEach((original) => {
+          assert.equal(original.segment.transitionId, originalInputs[0].segment.transitionId);
+          assert.equal(original.bestProbFromSet, originalInputs[0].bestProbFromSet);
+          assert.equal(original.subsetId, originalInputs[0].subsetId);
+        });
+      });
+
+      it('merging order: big + large, then + transform', () => {
+        const { originalInput, paths, splitOriginalInputs } = buildPath();
+
+        const headMerge = paths[0].merge(paths[1]);
+
+        // Assertions
+        assert.equal(headMerge.inputCount, 1);
+        assert.isTrue(headMerge instanceof SearchQuotientSpur);
+        assert.deepEqual(headMerge.bestExample.text, "biglarge");
+        assert.deepEqual((headMerge as SearchQuotientSpur).inputs, [
+          { sample: { insert: 'biglarge', deleteLeft: 0, id: 11 }, p: 1 }
+        ]);
+        assert.deepEqual((headMerge as SearchQuotientSpur).inputSource, {
+          ...originalInput,
+          segment: {
+            ...splitOriginalInputs[0].segment,
+            end: splitOriginalInputs[1].segment.end
+          }
+        });
+
+        const fullMerge = headMerge.merge(paths[2]);
+        checkFinalStateAssertions(fullMerge as SearchQuotientSpur, originalInput);
+      });
+
+      it('merging order: large + transform, then + big', () => {
+        const { originalInput, paths, splitOriginalInputs } = buildPath();
+
+        const tailMerge = paths[1].merge(paths[2]);
+
+        // Assertions
+        assert.equal(tailMerge.inputCount, 1);
+        assert.isTrue(tailMerge instanceof SearchQuotientSpur);
+        assert.deepEqual(tailMerge.bestExample.text, "largetransform");
+        assert.deepEqual((tailMerge as SearchQuotientSpur).inputs, [
+          { sample: { insert: 'largetransform', deleteLeft: 0, id: 11 }, p: 1 }
+        ]);
+        assert.deepEqual((tailMerge as SearchQuotientSpur).inputSource, {
+          ...originalInput,
+          segment: {
+            ...splitOriginalInputs[2].segment,
+            start: splitOriginalInputs[1].segment.start
+          }
+        });
+
+        const fullMerge = paths[0].merge(tailMerge);
+        checkFinalStateAssertions(fullMerge as SearchQuotientSpur, originalInput);
+      });
+    });
+
+    // Covers many common aspects of SearchQuotientSpur merging, though not merging of
+    // multi-member distributions.
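+    // Fixture arithmetic, for reference: 'ca' → +'ent' (deleteLeft 1) ⇒ 'cent'
+    // → +'llar' (deleteLeft 2) ⇒ 'cellar' → +'o' (deleteLeft 2) ⇒ 'cello'.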
+    describe(`previously-split token comprised of complex, rewriting transforms: cello`, () => {
+      const buildPath = (inputs: Distribution[], sources: PathInputProperties[], root?: SearchQuotientNode) => {
+        return inputs.reduce((path, input, index) => new LegacyQuotientSpur(path, input, sources[index]), root ?? new LegacyQuotientRoot(testModel));
+      }
+
+      const buildFixtures = () => {
+        const trueDistributions = [
+          [
+            { sample: {insert: 'ca', deleteLeft: 0, id: 11}, p: 1 }
+          ], [
+            { sample: {insert: 'ent', deleteLeft: 1, id: 12}, p: 1 }
+          ], [
+            { sample: {insert: 'llar', deleteLeft: 2, id: 13}, p: 1 }
+          ], [
+            { sample: {insert: 'o', deleteLeft: 2, id: 14}, p: 1 }
+          ]
+        ];
+
+        const trueInputSources: PathInputProperties[] = trueDistributions.map((d) => {
+          return {
+            segment: {
+              start: 0,
+              trueTransform: d[0].sample,
+              transitionId: d[0].sample.id
+            },
+            bestProbFromSet: d[0].p,
+            subsetId: generateSubsetId()
+          }
+        });
+
+        const commonRoot = new LegacyQuotientRoot(testModel);
+        const mergeTarget = buildPath(trueDistributions, trueInputSources, commonRoot);
+
+        // Index: the position of the split.
+        const splits: [SearchQuotientNode, SearchQuotientNode][] = [];
+
+        // Case 0: bare head path, reproduced token (on different root)
+        splits.push([
+          commonRoot, buildPath(trueDistributions, trueInputSources)
+        ]);
+
+        // Case 1: the split happens in token 2 (index 1), with the deleteLeft
+        // split from the insert.
+        splits.push([
+          buildPath([
+            trueDistributions[0],
+            [{ sample: {insert: '', deleteLeft: 1, id: 12}, p: 1 }]
+          ], trueInputSources.slice(0, 2), commonRoot),
+          buildPath([
+            [{ sample: {insert: 'ent', deleteLeft: 0, id: 12}, p: 1 }],
+            ...trueDistributions.slice(2)
+          ], [
+            {...trueInputSources[1], segment: {...trueInputSources[1].segment, start: 0}},
+            ...trueInputSources.slice(2)
+          ])
+        ]);
+
+        // Case 2: the split happens in token 3 (index 2), with the deleteLeft
+        // split from the insert.
+        splits.push([
+          buildPath([
+            ...trueDistributions.slice(0, 2),
+            [{ sample: {insert: '', deleteLeft: 2, id: 13}, p: 1 }]
+          ], trueInputSources.slice(0, 3), commonRoot),
+          buildPath([
+            [{ sample: {insert: 'llar', deleteLeft: 0, id: 13}, p: 1 }],
+            ...trueDistributions.slice(3)
+          ], [
+            {...trueInputSources[2], segment: {...trueInputSources[2].segment, start: 0}},
+            ...trueInputSources.slice(3)
+          ])
+        ]);
+
+        // Case 3: the split happens in token 3 (index 2), in the middle of the
+        // insert.
+        splits.push([
+          buildPath([
+            ...trueDistributions.slice(0, 2),
+            [{ sample: {insert: 'l', deleteLeft: 2, id: 13}, p: 1 }]
+          ], trueInputSources.slice(0, 3), commonRoot),
+          buildPath([
+            [{ sample: {insert: 'lar', deleteLeft: 0, id: 13}, p: 1 }],
+            ...trueDistributions.slice(3)
+          ], [
+            {...trueInputSources[2], segment: {...trueInputSources[2].segment, start: 1}},
+            ...trueInputSources.slice(3)
+          ])
+        ]);
+
+        // Case 4: the split happens in token 4 (index 3), with the deleteLeft
+        // split from the insert.
+        splits.push([
+          buildPath([
+            ...trueDistributions.slice(0, 3),
+            [{ sample: {insert: '', deleteLeft: 2, id: 14}, p: 1 }]
+          ], trueInputSources.slice(), commonRoot),
+          buildPath([
+            [{ sample: {insert: 'o', deleteLeft: 0, id: 14}, p: 1 }]
+          ], [
+            {...trueInputSources[3], segment: {...trueInputSources[3].segment, start: 0}},
+          ])
+        ]);
+
+        // Case 5: the split happens at the token's end, leaving the tail
+        // as a fresh, empty token.
+        splits.push([
+          buildPath(trueDistributions, trueInputSources, commonRoot),
+          new LegacyQuotientRoot(testModel)
+        ]);
+
+        return {
+          mergeTarget,
+          splits,
+          trueDistributions
+        };
+      }
+
+      const runCommonAssertions = (splitIndex: number) => {
+        const { mergeTarget, splits, trueDistributions } = buildFixtures();
+        const splitToTest = splits[splitIndex];
+
+        const remergedPath = splitToTest[0].merge(splitToTest[1]) as SearchQuotientSpur;
+
+        assert.deepEqual(remergedPath.bestExample, mergeTarget.bestExample);
+        assert.equal(remergedPath.inputCount, mergeTarget.inputCount);
+        assert.equal(remergedPath.codepointLength, mergeTarget.codepointLength);
+        assert.sameDeepOrderedMembers(remergedPath.inputSegments, mergeTarget.inputSegments);
+        assert.isTrue(quotientPathHasInputs(remergedPath, trueDistributions));
+      }
+
+      it('setup: constructs path properly', () => {
+        const { mergeTarget, splits } = buildFixtures();
+
+        const targetText = mergeTarget.bestExample.text;
+
+        for(let i = 0; i < splits.length; i++) {
+          const splitSet = splits[i];
+
+          assert.equal(splitSet[0].codepointLength, i);
+          assert.equal(splitSet[0].bestExample.text, KMWString.substring(targetText, 0, i));
+          assert.equal(splitSet[1].codepointLength, KMWString.length(targetText) - i);
+          assert.equal(splitSet[1].bestExample.text, KMWString.substring(targetText, i));
+        }
+      });
+
+      it('re-merges properly when split at index 0', () => {
+        runCommonAssertions(0);
+      });
+
+      it('re-merges properly when split at index 1', () => {
+        runCommonAssertions(1);
+      });
+
+      it('re-merges properly when split at index 2', () => {
+        runCommonAssertions(2);
+      });
+
+      it('re-merges properly when split at index 3', () => {
+        runCommonAssertions(3);
+      });
+
+      it('re-merges properly when split at index 4', () => {
+        runCommonAssertions(4);
+      });
+
+      it('re-merges properly when split at index 5', () => {
+        runCommonAssertions(5);
+      });
+    });
+
+    // Same as the prior set, but now with non-BMP text!
+    describe(`previously-split token comprised of complex, rewriting non-BMP transforms`, () => {
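+      // toMathematicalSMP maps each ASCII letter to an SMP Mathematical
+      // Alphanumeric codepoint: every character becomes a surrogate pair (two
+      // code units), while KMWString still counts one codepoint per letter.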
+      const buildPath = (inputs: Distribution[], sources: PathInputProperties[], root?: SearchQuotientNode) => {
+        return inputs.reduce((path, input, index) => new LegacyQuotientSpur(path, input, sources[index]), root ?? new LegacyQuotientRoot(testModel));
+      }
+
+      const buildFixtures = () => {
+        const trueDistributions = [
+          [
+            { sample: {insert: toMathematicalSMP('ca'), deleteLeft: 0, id: 11}, p: 1 }
+          ], [
+            { sample: {insert: toMathematicalSMP('ent'), deleteLeft: 1, id: 12}, p: 1 }
+          ], [
+            { sample: {insert: toMathematicalSMP('llar'), deleteLeft: 2, id: 13}, p: 1 }
+          ], [
+            { sample: {insert: toMathematicalSMP('o'), deleteLeft: 2, id: 14}, p: 1 }
+          ]
+        ];
+
+        const trueInputSources: PathInputProperties[] = trueDistributions.map((d) => {
+          return {
+            segment: {
+              start: 0,
+              trueTransform: d[0].sample,
+              transitionId: d[0].sample.id
+            },
+            bestProbFromSet: d[0].p,
+            subsetId: generateSubsetId()
+          }
+        });
+
+        const commonRoot = new LegacyQuotientRoot(testModel);
+        const mergeTarget = buildPath(trueDistributions, trueInputSources, commonRoot);
+
+        // Index: the position of the split.
+        const splits: [SearchQuotientNode, SearchQuotientNode][] = [];
+
+        // Case 0: bare head path, reproduced token (on different root)
+        splits.push([
+          commonRoot, buildPath(trueDistributions, trueInputSources)
+        ]);
+
+        // Case 1: the split happens in token 2 (index 1), with the deleteLeft
+        // split from the insert.
+        splits.push([
+          buildPath([
+            trueDistributions[0],
+            [{ sample: {insert: toMathematicalSMP(''), deleteLeft: 1, id: 12}, p: 1 }]
+          ], trueInputSources.slice(0, 2), commonRoot),
+          buildPath([
+            [{ sample: {insert: toMathematicalSMP('ent'), deleteLeft: 0, id: 12}, p: 1 }],
+            ...trueDistributions.slice(2)
+          ], [
+            {...trueInputSources[1], segment: {...trueInputSources[1].segment, start: 0}},
+            ...trueInputSources.slice(2)
+          ])
+        ]);
+
+        // Case 2: the split happens in token 3 (index 2), with the deleteLeft
+        // split from the insert.
+        splits.push([
+          buildPath([
+            ...trueDistributions.slice(0, 2),
+            [{ sample: {insert: toMathematicalSMP(''), deleteLeft: 2, id: 13}, p: 1 }]
+          ], trueInputSources.slice(0, 3), commonRoot),
+          buildPath([
+            [{ sample: {insert: toMathematicalSMP('llar'), deleteLeft: 0, id: 13}, p: 1 }],
+            ...trueDistributions.slice(3)
+          ], [
+            {...trueInputSources[2], segment: {...trueInputSources[2].segment, start: 0}},
+            ...trueInputSources.slice(3)
+          ])
+        ]);
+
+        // Case 3: the split happens in token 3 (index 2), in the middle of the
+        // insert.
+        splits.push([
+          buildPath([
+            ...trueDistributions.slice(0, 2),
+            [{ sample: {insert: toMathematicalSMP('l'), deleteLeft: 2, id: 13}, p: 1 }]
+          ], trueInputSources.slice(0, 3), commonRoot),
+          buildPath([
+            [{ sample: {insert: toMathematicalSMP('lar'), deleteLeft: 0, id: 13}, p: 1 }],
+            ...trueDistributions.slice(3)
+          ], [
+            {...trueInputSources[2], segment: {...trueInputSources[2].segment, start: 1}},
+            ...trueInputSources.slice(3)
+          ])
+        ]);
+
+        // Case 4: the split happens in token 4 (index 3), with the deleteLeft
+        // split from the insert.
+        splits.push([
+          buildPath([
+            ...trueDistributions.slice(0, 3),
+            [{ sample: {insert: toMathematicalSMP(''), deleteLeft: 2, id: 14}, p: 1 }]
+          ], trueInputSources.slice(), commonRoot),
+          buildPath([
+            [{ sample: {insert: toMathematicalSMP('o'), deleteLeft: 0, id: 14}, p: 1 }]
+          ], [
+            {...trueInputSources[3], segment: {...trueInputSources[3].segment, start: 0}},
+          ])
+        ]);
+
+        // Case 5: the split happens at the token's end, leaving the tail
+        // as a fresh, empty token.
+        splits.push([
+          buildPath(trueDistributions, trueInputSources, commonRoot),
+          new LegacyQuotientRoot(testModel)
+        ]);
+
+        return {
+          mergeTarget,
+          splits,
+          trueDistributions
+        };
+      }
+
+      const runCommonAssertions = (splitIndex: number) => {
+        const { mergeTarget, splits, trueDistributions } = buildFixtures();
+        const splitToTest = splits[splitIndex];
+
+        const remergedPath = splitToTest[0].merge(splitToTest[1]) as SearchQuotientSpur;
+
+        assert.deepEqual(remergedPath.bestExample, mergeTarget.bestExample);
+        assert.equal(remergedPath.inputCount, mergeTarget.inputCount);
+        assert.equal(remergedPath.codepointLength, mergeTarget.codepointLength);
+        assert.sameDeepOrderedMembers(remergedPath.inputSegments, mergeTarget.inputSegments);
+        assert.isTrue(quotientPathHasInputs(remergedPath, trueDistributions));
+      }
+
+      it('setup: constructs path properly', () => {
+        // Validate that an SMP-conversion has occurred.
+        assert.notEqual(toMathematicalSMP("cello"), "cello");
+        assert.equal(toMathematicalSMP("cello").length, "cello".length * 2);
+        assert.equal(KMWString.length(toMathematicalSMP("cello")), KMWString.length("cello"));
+
+        const { mergeTarget, splits } = buildFixtures();
+
+        const targetText = mergeTarget.bestExample.text;
+        assert.equal(targetText, toMathematicalSMP("cello"));
+
+        for(let i = 0; i < splits.length; i++) {
+          const splitSet = splits[i];
+
+          assert.equal(splitSet[0].codepointLength, i);
+          assert.equal(splitSet[0].bestExample.text, KMWString.substring(targetText, 0, i));
+          assert.equal(splitSet[1].codepointLength, KMWString.length(targetText) - i);
+          assert.equal(splitSet[1].bestExample.text, KMWString.substring(targetText, i));
+        }
+      });
+
+      it('re-merges properly when split at index 0', () => {
+        runCommonAssertions(0);
+      });
+
+      it('re-merges properly when split at index 1', () => {
+        runCommonAssertions(1);
+      });
+
+      it('re-merges properly when split at index 2', () => {
+        runCommonAssertions(2);
+      });
+
+      it('re-merges properly when split at index 3', () => {
+        runCommonAssertions(3);
+      });
+
+      it('re-merges properly when split at index 4', () => {
+        runCommonAssertions(4);
+      });
+
+      it('re-merges properly when split at index 5', () => {
+        runCommonAssertions(5);
+      });
+    });
+
+    it('correctly merges paths previously split mid-input', () => {
+      let path: SearchQuotientNode = new LegacyQuotientRoot(testModel);
+      const startSample = {sample: { insert: 'a', deleteLeft: 0 }, p: 1};
+      path = new LegacyQuotientSpur(path, [startSample], startSample);
+
+      const inputDistribution = [
+        {sample: { insert: 'four', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.4},
+        {sample: { insert: 'then', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.3},
+        {sample: { insert: 'nine', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.2},
+        {sample: { insert: 'what', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.06},
+        {sample: { insert: 'cent', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.04}
+      ];
+
+      const mergeTarget = new LegacyQuotientSpur(path, inputDistribution, inputDistribution[0]);
+      assert.equal(mergeTarget.codepointLength, 4);
+      assert.equal(mergeTarget.inputCount, 2);
+
+      // This test models a previous split at codepoint index 2, splitting
+      // the input distribution accordingly. (Note: deleteLeft = 1!)
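+      // Per distribution entry: e.g. 'four' was split into head 'fo' (which
+      // retains the deleteLeft) and tail 'ur'; merge() must rebuild the
+      // original five transforms exactly.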
+      const headDistributionSplit = [
+        {sample: { insert: 'fo', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.4},
+        {sample: { insert: 'th', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.3},
+        {sample: { insert: 'ni', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.2},
+        {sample: { insert: 'wh', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.06},
+        {sample: { insert: 'ce', deleteLeft: 1, deleteRight: 0, id: 42 }, p: 0.04}
+      ];
+      const headPath = new LegacyQuotientSpur(
+        path, headDistributionSplit, {
+          segment: {
+            start: 0,
+            trueTransform: inputDistribution[0].sample,
+            transitionId: inputDistribution[0].sample.id
+          },
+          bestProbFromSet: inputDistribution[0].p,
+          subsetId: mergeTarget.inputSource.subsetId
+        }
+      );
+
+      const tailDistributionSplit = [
+        {sample: { insert: 'ur', deleteLeft: 0, deleteRight: 0, id: 42 }, p: 0.4},
+        {sample: { insert: 'en', deleteLeft: 0, deleteRight: 0, id: 42 }, p: 0.3},
+        {sample: { insert: 'ne', deleteLeft: 0, deleteRight: 0, id: 42 }, p: 0.2},
+        {sample: { insert: 'at', deleteLeft: 0, deleteRight: 0, id: 42 }, p: 0.06},
+        {sample: { insert: 'nt', deleteLeft: 0, deleteRight: 0, id: 42 }, p: 0.04}
+      ];
+      const tailPath = new LegacyQuotientSpur(
+        new LegacyQuotientRoot(testModel), tailDistributionSplit, {
+          segment: {
+            start: 2,
+            trueTransform: inputDistribution[0].sample,
+            transitionId: inputDistribution[0].sample.id
+          },
+          bestProbFromSet: inputDistribution[0].p,
+          subsetId: mergeTarget.inputSource.subsetId
+        }
+      );
+
+      const remerged = headPath.merge(tailPath);
+
+      assert.deepEqual(remerged.bestExample, mergeTarget.bestExample);
+      assert.equal(remerged.inputCount, 2);
+      assert.isTrue(remerged instanceof SearchQuotientSpur);
+      assert.deepEqual((remerged as SearchQuotientSpur).inputs, inputDistribution);
+      assert.isTrue(quotientPathHasInputs(remerged, [[startSample], inputDistribution]));
+    });
+  });
 });
\ No newline at end of file