diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/alignment-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/alignment-helpers.ts deleted file mode 100644 index 4f133da119e..00000000000 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/alignment-helpers.ts +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Keyman is copyright (C) SIL Global. MIT License. - * - * Created by jahorton on 2025-07-30 - * - * This file defines methods used as helpers when aligning cached context state - * information with incoming contexts and when validating partial substitution - * edits for aligned context tokens. - */ - -import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates'; -import { ClassicalDistanceCalculation, computeDistance, EditTuple } from "./classical-calculation.js"; -import { ExtendedEditOperation } from './segmentable-calculation.js'; - -/** - * Represents token-count values resulting from an alignment attempt between two - * different modeled context states. - */ -export type ContextStateAlignment = { - /** - * Denotes whether or not alignment is possible between two contexts. - */ - canAlign: false, - - /** - * Indicates the edit path that could not be handled. (Useful for error reporting) - * - * The edit path does not include actual user text and is sanitized. - */ - editPath: EditTuple[]; -} | { - /** - * Denotes whether or not alignment is possible between two contexts. - */ - canAlign: true, - - /** - * Indicates the edit path that could not be handled. (Useful for error reporting) - * - * The edit path does not include actual user text and is sanitized. - */ - editPath: EditTuple[]; - - /** - * Notes the number of tokens added to the head of the 'incoming'/'new' context - * of the contexts being aligned. If negative, the incoming context deleted - * a token found in the 'original' / base context. - * - * For the alignment, [base context index] + leadTokenShift = [incoming context index]. - */ - leadTokenShift: number, - /** - * Notes the number of tokens at the head of the 'incoming'/'new' context, - * perfectly aligned but edited for two successfully-alignable contexts. These - * tokens directly precede those that need no edits. - * - * When a token could be considered as either 'lead' or 'tail' edit, it will - * only be reported as a 'tail' edit. - */ - leadEditLength: number, - /** - * The count of tokens perfectly aligned, with no need for edits, for two successfully- - * alignable contexts. - */ - matchLength: number, - /** - * The count of tokens at the tail perfectly aligned (existing in both contexts) but - * edited for two successfully-alignable contexts. These tokens directly follow those - * that need no edits. - */ - tailEditLength: number, - /** - * The count of new tokens added at the end of the incoming context for two aligned contexts. - * If negative, the incoming context deleted a previously-existing token from the original. - */ - tailTokenShift: number -}; - - -/** - * Determines the proper 'last match' index for a tokenized sequence based on its edit path. 
- * - * In particular, this method is designed to handle the following cases: - * - ['to', ' ', 'apple', ' ', ''] => ['to', ' ', 'apply', ' ', ''] - * - ['to', ' ', 'apple', ' ', ''] => ['to', ' ', 'apply', ' ', 'n'] - * - * Edit path for these example cases: - * - ['match', 'match', 'substitute', 'match', 'match'] - * - ['match', 'match', 'substitute', 'match', 'substitute'] - * - * In cases such as these, the late whitespace match should be considered 'edited'. While the - * ' ' is unedited, it follows the edited 'apple' => 'apply', so it must have been deleted and - * then re-inserted. As a result, the whitespace after 'to' is the true "last matched" token. - * - * Returns -1 if an unexpected edit other than 'substitute' occurs in the middle of the big - * 'match' block. - * @param editPath - * @returns - */ -export function getEditPathLastMatch(editPath: ExtendedEditOperation[], forAppliedSuggestion?: boolean) { - // Assertion: for a long context, the bulk of the edit path should be a - // continuous block of 'match' entries. If there's anything but a substitution - // in the middle, we have a context mismatch. - // - // That said, it is possible to apply a suggestion after a backspace. Anything - // after the substitution needs to be treated as a substitution rather than - // a match. - const firstMatch = editPath.indexOf('match'); - const lastMatch = editPath.lastIndexOf('match'); - if(firstMatch > -1) { - for(let i = firstMatch+1; i <= lastMatch; i++) { - if(editPath[i] != 'match') { - // fun case: ' ' + ' applied ' has an unusual edit path. - // we get an 'insert'. - return ( - (editPath[i] == 'substitute') - || (forAppliedSuggestion && editPath[i] == 'insert') - ) ? (i - 1) : -1; - } - } - } - - return lastMatch; -} - -/** - * Aligns two tokens on a character-by-character basis as needed for higher, token-level alignment - * operations. - * @param incomingToken The incoming token value - * @param matchingToken The pre-existing token value to use for comparison and alignment - * @param forNearCaret If `false`, disallows any substitutions and activates a leading-edge alignment - * validation mode. - * @returns - */ -export function isSubstitutionAlignable( - incomingToken: string, - matchingToken: string, - forNearCaret?: boolean -): boolean { - // 1 - Determine the edit path for the word. - const subEditCalc = computeDistance( - // Use max length in case the word is actually already partly out of - // the sliding context window. - new ClassicalDistanceCalculation({ diagonalWidth: Math.max(incomingToken.length, matchingToken.length) }), - [...matchingToken], - [...incomingToken], - ); - let subEditPath = subEditCalc.editPath()[0].map(t => t.op); - - const firstInsert = subEditPath.indexOf('insert'); - const firstDelete = subEditPath.indexOf('delete'); - - // 2 - deletions and insertions should be mutually exclusive. - // A fixed, unedited word can't slide across both 'left' and 'right' boundaries at the same time. - if(firstInsert != -1 && firstDelete != -1) { - return false; - }; - - // 3 - checks exclusive to leading-edge conditions - if(!forNearCaret) { - const firstSubstitute = subEditPath.indexOf('substitute'); - const firstMatch = subEditPath.indexOf('match'); - if(firstSubstitute > -1) { - // When this is called for a word not adjacent to the caret, its letters shouldn't be - // substituted - that operation doesn't happen at a sliding context-window edge. 
- return false; - } else if(firstMatch > -1) { - const lastMatch = subEditPath.lastIndexOf('match'); - // Should not have inserts or deletes on both sides of matched text! - // Due to how the edit path is calculated, an insert or delete could appear after the - // firstMatch - especially in the case of adjacent double letters. - // - // Ex: applesauce => plesauce tends to say 'match', then 'delete', on the two 'p's. - if(firstInsert > -1 && firstInsert < firstMatch && subEditPath.lastIndexOf('insert') > lastMatch) { - return false; - } else if(firstDelete > -1 && firstDelete < firstMatch && subEditPath.lastIndexOf('delete') > lastMatch) { - return false; - } - } - - // Further checks below are oriented for text/tokens at the caret. - return true; - } - - // 4 - check the stats for total edits of each type and validate that edits don't overly exceed - // original characters. - const editCount = { - matchMove: 0, - rawEdit: 0 - }; - - subEditPath.forEach((entry) => { - switch(entry) { - case 'transpose-end': - case 'transpose-start': - case 'match': - editCount.matchMove++; - break; - case 'insert': - case 'transpose-insert': - case 'delete': - case 'transpose-delete': - case 'substitute': - editCount.rawEdit++; - } - }); - - // We shouldn't have more raw substitutions, inserts, and deletes than matches + transposes, - // though allowing +1 as a fudge factor. - // The 'a' => 'à' pattern can be a reasonably common Keyman keyboard rule and - // is one substitution, zero matches in NFC. - if(editCount.matchMove + 1 < editCount.rawEdit) { - return false; - } - - return true; -} - -/** - * Determines the alignment between a new, incoming tokenization source and the - * tokenization modeled by the current instance. - * @param tokenizationToMatch Raw strings corresponding to the tokenization of the original context - * @param incomingTokenization Raw strings corresponding to the tokenization of the incoming context - * @param isSliding Notes if the context window is full (and sliding-alignment is particularly needed) - * @param forAppliedSuggestion When true, this asserts that the contexts are alignable and loosens - * alignment requirements accordingly. - * @returns Alignment data that details if and how the incoming tokenization aligns with - * the tokenization modeled by this instance. - */ -export function computeAlignment( - tokenizationToMatch: string[], - incomingTokenization: string[], - isSliding: boolean, - forAppliedSuggestion?: boolean -): ContextStateAlignment { - const src = tokenizationToMatch; - const dst = incomingTokenization; - - // let changedEmptyTail = false; - if(dst[dst.length - 1] == '') { - // Only allow matching if the tokenizations are identical, thus the empty - // token was unaffected. - if(src.length != dst.length || src[dst.length - 1] != '') { - // Do not allow empty-token matches to match each other; this complicates - // things when applying zero-root suggestions. - // - // The SENTINEL char should never appear in raw text, thus should never - // match anything in the "tokenization to match". - dst[dst.length - 1] = SENTINEL_CODE_UNIT; - } - } - - // Inverted order, since 'match' existed before our new context. - const mapping = computeDistance( - // Diagonal width allows asymmetric edits and is also needed to cover - // difference in length for the inputs. We should try to cover at least 2 - // edits on one side in addition to potential length asymmetry. 
- new ClassicalDistanceCalculation({diagonalWidth: Math.abs(src.length - dst.length) + 3}), - src, - dst - ); - - // Later iteration: we could return this itself directly for use in alignment - // operations, rather than relying solely on the edit-op names. - let editPaths = mapping.editPath(); - if(editPaths.length == 0) { - console.error(`Could not compute edit path for aligning contexts of length ${src.length}, ${dst.length}`); - } - let editPath = editPaths[0].map(t => t.op); - - const failure: ContextStateAlignment = { - canAlign: false, - editPath: editPaths[0] - }; - - // Special case: new context bootstrapping - first token often substitutes. - // The text length is small enough that no words should be able to rotate out the start of the context. - // Special handling needed in case of no 'match'; the rest of the method assumes at least one 'match'. - if(editPath.length <= 3 && (editPath[0] == 'substitute' || editPath[0] == 'match')) { - let matchCount = 0; - let subCount = 0; - for(let i = 0; i < editPath.length; i++) { - if(editPath[i] == 'substitute') { - subCount++; - if(!forAppliedSuggestion && !isSubstitutionAlignable(incomingTokenization[i], tokenizationToMatch[i], true)) { - return failure; - } - } else if(editPath[i] == 'match') { - // If a substitution is already recorded, treat the 'match' as a substitution. - if(subCount > 0) { - subCount++; - } else { - matchCount++; - } - } - } - - const insertCount = editPath.filter((entry) => entry == 'insert').length; - const deleteCount = editPath.filter((entry) => entry == 'delete').length; - - return { - canAlign: true, - editPath: editPaths[0], - matchLength: matchCount, - leadTokenShift: 0, - leadEditLength: 0, - tailEditLength: subCount, - tailTokenShift: insertCount - deleteCount - } - } - - // From here on assumes that at least one 'match' exists on the path. - // It all works great... once the context is long enough for at least one stable token. - const firstMatch = editPath.indexOf('match'); - - if(firstMatch == -1) { - // If there are no matches, there's no alignment. - return failure; - } - - // Transpositions are not allowed at the token level during context alignment. - if(editPath.find((entry) => entry.indexOf('transpose') > -1)) { - return failure; - } - - const lastMatch = getEditPathLastMatch(editPath, forAppliedSuggestion); - - // Assertion: for a long context, the bulk of the edit path should be a - // continuous block of 'match' entries. If there's anything else in - // the middle, we have a context mismatch. - if(lastMatch == -1) { - return failure; - } - - let matchLength = lastMatch - firstMatch + 1; - let tailInsertLength = 0; - let tailDeleteLength = 0; - for(let i = lastMatch; i < editPath.length; i++) { - if(editPath[i] == 'insert') { - tailInsertLength++; - } else if(editPath[i] == 'delete') { - tailDeleteLength++; - } - } - if(tailInsertLength > 0 && tailDeleteLength > 0) { - // Something's gone weird if this happens; that should appear as a substitution instead. - // Otherwise, we have a VERY niche edit scenario. - return failure; - } - const tailSubstituteLength = (editPath.length - 1 - lastMatch) - tailInsertLength - tailDeleteLength; - - // If we have a perfect match with a pre-existing context, no mutations have - // happened; we have a 100% perfect match. 
- if(firstMatch == 0 && lastMatch == editPath.length - 1) { - return { - canAlign: true, - editPath: editPaths[0], - leadTokenShift: 0, - leadEditLength: 0, - matchLength, - tailEditLength: tailSubstituteLength, - tailTokenShift: tailInsertLength - tailDeleteLength - }; - } - - // The edit path calc tries to put substitutes first, before inserts. - // We don't want that on the leading edge. - const lastEarlyInsert = editPath.lastIndexOf('insert', firstMatch); - const firstSubstitute = editPath.indexOf('substitute'); - if(firstSubstitute > -1 && firstSubstitute < firstMatch && firstSubstitute < lastEarlyInsert) { - editPath[firstSubstitute] = 'insert'; - editPath[lastEarlyInsert] = 'substitute'; - } - - // If mutations HAVE happened, we need to double-check the context-state alignment. - let priorEdit: typeof editPath[0]; - let leadTokensRemoved = 0; - let leadSubstitutions = 0; - - // The `i` index below aligns based upon the index within the `tokenizationToMatch` sequence - // and how it would have to be edited to align to the `incomingTokenization` sequence. - for(let i = 0; i < firstMatch; i++) { - switch(editPath[i]) { - case 'delete': - // All deletions should appear at the sliding window edge; if a deletion appears - // after the edge, but before the first match, something's wrong. - if(priorEdit && priorEdit != 'delete') { - return failure; - } - leadTokensRemoved++; - break; - case 'substitute': - // Find the word before and after substitution. - const incomingIndex = i - (leadTokensRemoved > 0 ? leadTokensRemoved : 0); - const matchingIndex = i + (leadTokensRemoved < 0 ? leadTokensRemoved : 0); - const incomingSub = incomingTokenization[incomingIndex]; - const matchingSub = tokenizationToMatch[matchingIndex]; - - const atSlidePoint = isSliding && (incomingIndex == 0 || matchingIndex == 0); - - // Double-check the word - does the 'substituted' word itself align? - // - // Exception: if the word is at the start of the context window and the - // context window is likely sliding, don't check it. - if(!forAppliedSuggestion && !atSlidePoint && !isSubstitutionAlignable(incomingSub, matchingSub)) { - return failure; - } - - leadSubstitutions++; - break; - case 'insert': - // Only allow an insert at the leading edge, as with 'delete's. - if(priorEdit && priorEdit != 'insert') { - return failure; - } - // In case of backspaces, it's also possible to 'insert' a 'new' - // token - an old one that's slid back into view. - leadTokensRemoved--; - break; - default: - // No 'match' can exist before the first found index for a 'match'. - // No 'transpose-' edits should exist within this section, either. - return failure; - } - priorEdit = editPath[i]; - } - - // If we need some form of tail-token substitution verification, add that here. - - return { - canAlign: true, - editPath: editPaths[0], - // leadTokensRemoved represents the number of tokens that must be removed from the base context - // when aligning the contexts. Externally, it's more helpful to think in terms of the count added - // to the incoming context. - leadTokenShift: -leadTokensRemoved + 0, // add 0 in case of a 'negative zero', which affects unit tests. 
- leadEditLength: leadSubstitutions, - matchLength, - tailEditLength: tailSubstituteLength, - tailTokenShift: tailInsertLength - tailDeleteLength - }; -} \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/index.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/index.ts index 6da63b7ced6..33d9f70f54c 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/index.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/index.ts @@ -1,5 +1,4 @@ export * from './classical-calculation.js'; export * from './context-tracker.js'; export * from './distance-modeler.js'; -export * from './execution-timer.js'; -export * from './transform-tokenization.js'; \ No newline at end of file +export * from './execution-timer.js'; \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/transform-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/transform-tokenization.ts deleted file mode 100644 index 2e22370d6f6..00000000000 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/transform-tokenization.ts +++ /dev/null @@ -1,189 +0,0 @@ -import { LexicalModelTypes } from '@keymanapp/common-types'; -import { applyTransform, type Tokenization } from "@keymanapp/models-templates"; - -import { determineModelTokenizer } from '../model-helpers.js'; - -import Context = LexicalModelTypes.Context; -import Distribution = LexicalModelTypes.Distribution; -import LexicalModel = LexicalModelTypes.LexicalModel; -import Transform = LexicalModelTypes.Transform; -import { computeAlignment } from './alignment-helpers.js'; -import { KMWString } from '@keymanapp/web-utils'; - -/** - * Determines a tokenization-aware sequence of (`Transform`) edits, one per - * token that would result after applying the incoming keystroke's `Transform` - * to its base `Context`. This sequence reproduces the same net effect as the - * original incoming `transform` when applied in sequence. The component - * transforms are then indexed relative to the position of the corresponding - * token in the base `Context`, with 0 matching the original token left of - * the text insertion point, negative indices affecting previous tokens, and - * positive indicies affecting new tokens. - * - * For example, using English and standard whitespace-based tokenization: - * - context: `the quick blue` - * - transform: `{ insert: 'rown fox', deleteLeft: 3 }` - * - Resulting context: `the quick brown fox` - * - Output: `Map { - * 0 => { insert: 'rown', deleteLeft: 3 }, - * 1 => { insert: ' ', deleteLeft: 0 }, - * 2 => { insert: 'fox', deleteLeft: 0 } - * }` - * @param tokenize The tokenization function to utilize, as determined by the - * active lexical model or its settings. - * @param context The original, unmodified context - * @param transform A specification of incoming edits to `context`. - * @returns - */ -export function tokenizeTransform( - tokenize: (context: Context) => Tokenization, - context: Context, - transform: Transform -): Map { - if(transform.insert == '' && transform.deleteLeft == 0) { - const map = new Map(); - map.set(0, { insert: '', deleteLeft: 0 }); - return map; - } - - // Context does not slide within this function. 
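  /*
   * Usage sketch (illustrative only, following the doc comment's example above and the
   * setup used by the corresponding unit tests; `defaultBreaker` and `tokenize` are the
   * imports those tests use from @keymanapp/models-wordbreakers and
   * @keymanapp/models-templates):
   *
   *   const defaultTokenize = (context: Context) => tokenize(defaultBreaker, context);
   *   const context: Context = { left: 'the quick blue', right: '', startOfBuffer: true, endOfBuffer: true };
   *   const perToken = tokenizeTransform(defaultTokenize, context, { insert: 'rown fox', deleteLeft: 3 });
   *   // perToken: Map { 0 => {insert: 'rown', deleteLeft: 3},
   *   //                 1 => {insert: ' ',    deleteLeft: 0},
   *   //                 2 => {insert: 'fox',  deleteLeft: 0} }
   *
   *   // Deletions that cross token boundaries yield negative keys; per the unit tests,
   *   // 'an apple a date' + { insert: 'ny', deleteLeft: 5 } maps to keys -2, -1, and 0.
   */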
- const postContext = applyTransform(transform, context); - const preTokenization = tokenize(context).left; - const postTokenization = tokenize(postContext).left; - if(preTokenization.length == 0) { - preTokenization.push({text: ''}); - } - if(postTokenization.length == 0) { - postTokenization.push({text: ''}); - } - - const alignment = computeAlignment( - preTokenization.map(t => t.text), - postTokenization.map(t => t.text), - false, - true - ); - - if(!alignment.canAlign || alignment.leadTokenShift) { - throw new Error(`Could not align context ${JSON.stringify(context)} before and after transform ${JSON.stringify(transform)}`); - } - - let deleteLeft = transform.deleteLeft; - const tokenizedTransforms: Transform[] = []; - // Create deletion transforms for deleted ... - for(let index = -alignment.tailTokenShift; index > 0; index--) { - const deletedToken = preTokenization.pop(); - const srcLen = KMWString.length(deletedToken.text); - deleteLeft -= KMWString.length(deletedToken.text); - - tokenizedTransforms.push({ insert: '', deleteLeft: srcLen }); - } - - let insert = transform.insert; - // Avoid emitting an empty transform if we land right at the end of a previous - // token (say, after a backspace) - if(insert.length > 0 || deleteLeft > 0) { - for(let index = postTokenization.length - 1; index >= 0; index--) { - const currentToken = postTokenization[index]; - const srcLen = KMWString.length(preTokenization[index]?.text ?? ''); - const curLen = KMWString.length(currentToken.text); - - if(srcLen >= deleteLeft && curLen >= KMWString.length(insert)) { - tokenizedTransforms.push({ - insert: insert, - deleteLeft: deleteLeft - }); - break; - } - - insert = insert.substring(0, insert.length - currentToken.text.length); - deleteLeft = Math.max(0, deleteLeft - srcLen); - tokenizedTransforms.push({ - insert: currentToken.text, - deleteLeft: srcLen - }); - } - } - - const returnedMap = new Map(); - - // We can very easily compute the final index that should appear in the math - - // what's the difference in tokenization length? - const finalIndex = postTokenization.length - preTokenization.length; - // We pushed the tokenizations onto a stack such that we pop them from - // early-context to late-context; we need to count up when indexing. - const baseIndex = finalIndex - tokenizedTransforms.length + 1; - let pushedCount: number = 0; - while(tokenizedTransforms.length > 0) { - returnedMap.set(baseIndex + pushedCount, tokenizedTransforms.pop()); - pushedCount++; - } - - return returnedMap; -} - -/** - * Given an incoming distribution of Transforms, this method applies - * `tokenizeTransform` for each, mapping each transform to its tokenized form in - * the returned distribution. - * - * It is believed that this may prove useful in the future for phrase-based - * suggestions and/or auto-correction of accidentally-typed whitespace. For - * now, it already sees limited use in preventing replacement of word-adjacent - * punctuation marks. 
- * @param tokenize - * @param context - * @param transformDistribution - * @returns - */ -export function tokenizeTransformDistribution( - tokenize: (context: Context) => Tokenization, - context: Context, - transformDistribution: Distribution -): Distribution> { - return transformDistribution.map((transform) => { - return { - sample: tokenizeTransform(tokenize, context, transform.sample), - p: transform.p - }; - }); -} - -/** - * Given an incoming distribution of Transforms, this method applies - * `tokenizeTransform` for each, mapping each transform to its tokenized form in - * the returned distribution. - * - * It then filters out all incoming Transforms that do not result in the same final - * number of tokens as the "primary input" when applied, as the context-tracker - * and predictive-text engine cannot handle word-breaking divergence well at - * this time. - * @param context - * @param model - * @param transformDistribution - * @returns - */ -export function tokenizeAndFilterDistribution( - context: Context, - model: LexicalModel, - transformDistribution?: Distribution -): Distribution> { - let tokenize = determineModelTokenizer(model); - const inputTransform = transformDistribution?.[0]; - - if(!inputTransform) { - return null; - } - - // These two methods apply transforms internally; do not mutate context here. - // This particularly matters for the 'distribution' variant. - const tokenizedInputTransform = tokenizeTransform(tokenize, context, inputTransform.sample); - const lastTokenizedInputIndex = [...tokenizedInputTransform.keys()].reverse()[0]; - const tokenizedDistribution = tokenizeTransformDistribution(tokenize, context, transformDistribution); - - // While we lack phrase-based / phrase-oriented prediction support, we'll just extract the - // set that matches the token length that results from our input. 
- return tokenizedDistribution.filter((entry) => - entry.sample.has(lastTokenizedInputIndex) && !entry.sample.has(lastTokenizedInputIndex + 1) - ); -} \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts index 4b40891a17e..a65c99506e6 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts @@ -4,14 +4,12 @@ export * from './correction/context-token.js'; export * from './correction/context-tokenization.js'; export { ContextTracker } from './correction/context-tracker.js'; export { ContextTransition } from './correction/context-transition.js'; -export * from './correction/alignment-helpers.js'; export { ExtendedEditOperation, SegmentableDistanceCalculation } from './correction/segmentable-calculation.js'; export * from './correction/tokenization-subsets.js'; export * as correction from './correction/index.js'; export * from './model-helpers.js'; export * as models from './models/index.js'; export { ModelCompositor } from './model-compositor.js'; -export * from './correction/transform-tokenization.js'; export * from './predict-helpers.js'; export { default as TransformUtils } from './transformUtils.js' export { default as LMLayerWorker } from './index.js' diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/alignment-helpers.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/alignment-helpers.tests.ts deleted file mode 100644 index 4872d317668..00000000000 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/alignment-helpers.tests.ts +++ /dev/null @@ -1,892 +0,0 @@ -/* - * Keyman is copyright (C) SIL Global. MIT License. - * - * Created by jahorton on 2025-07-30 - * - * This file contains low-level tests designed to validate helper functions - * used when aligning cached context states to incoming contexts and when - * validating potential substitution edit operations. - */ - -import { assert } from 'chai'; -import { computeAlignment, EditOperation, getEditPathLastMatch, isSubstitutionAlignable } from '@keymanapp/lm-worker/test-index'; - -describe('getEditPathLastMatch', () => { - it('returns the last match when no substitutions exist', () => { - const path: EditOperation[] = ['delete', 'delete', 'match', 'match', 'match', 'match', 'insert']; - assert.equal(path.lastIndexOf('match'), 5); - assert.equal(getEditPathLastMatch(path), 5); - }); - - it('returns the last match when no substitutions exist left of a "match"', () => { - const path: EditOperation[] = ['delete', 'delete', 'match', 'match', 'match', 'match', 'substitute', 'insert']; - assert.equal(path.lastIndexOf('match'), 5); - assert.equal(getEditPathLastMatch(path), 5); - }); - - // is intended to handle application of suggestions. - it('returns the last match before a substitute occurring after the first match', () => { - // limitation: if there is _anything_ after that last match, the first assertion will fail. - // 0 1 2 3 4 5 6 - const path: EditOperation[] = ['delete', 'delete', 'match', 'match', 'substitute', 'match', 'match']; - assert.notEqual(getEditPathLastMatch(path), 6); - assert.equal(getEditPathLastMatch(path), 3); - }); - - // is intended to handle complex transforms that include a whitespace and affect prior tokens. 
- it('returns the last match before a substitute occurring after the first match', () => { - // limitation: if there is _anything_ after that last match, the first assertion will fail. - // 0 1 2 3 4 5 6 - const path: EditOperation[] = ['delete', 'delete', 'match', 'match', 'substitute', 'match', 'substitute']; - assert.notEqual(getEditPathLastMatch(path), 5); - assert.equal(getEditPathLastMatch(path), 3); - }); -}); - -describe('isSubstitutionAlignable', () => { - it(`returns true: 'ca' => 'can'`, () => { - assert.isTrue(isSubstitutionAlignable('can', 'ca')); - }); - - // Leading word in context window starts sliding out of said window. - it(`returns true: 'can' => 'an'`, () => { - assert.isTrue(isSubstitutionAlignable('an', 'can')); - }); - - // Same edits on both sides: not valid. - it(`returns false: 'apple' => 'grapples'`, () => { - assert.isFalse(isSubstitutionAlignable('grapples', 'apple')); - }); - - // Edits on one side: valid. - it(`returns true: 'apple' => 'grapple'`, () => { - assert.isTrue(isSubstitutionAlignable('grapple', 'apple')); - }); - - // Edits on one side: valid. - it(`returns true: 'apple' => 'grapple'`, () => { - assert.isTrue(isSubstitutionAlignable('apples', 'apple')); - }); - - // Same edits on both sides: not valid. - it(`returns false: 'grapples' => 'apple'`, () => { - assert.isFalse(isSubstitutionAlignable('apple', 'grapples')); - }); - - // Substitution: not valid when not permitted via parameter. - it(`returns false: 'apple' => 'banana'`, () => { - // edit path: 'insert' ('b' of banana), 'match' (on leading a), rest are 'substitute'. - assert.isFalse(isSubstitutionAlignable('banana', 'apple')); - }); - - // Substitution: not valid if too much is substituted, even if allowed via parameter. - it(`returns false: 'apple' => 'banana' (subs allowed)`, () => { - // edit path: 'insert' ('b' of banana), 'match' (on leading a), rest are 'substitute'. - // 1 match vs 4 substitute = no bueno. It'd require too niche of a keyboard rule. - assert.isFalse(isSubstitutionAlignable('banana', 'apple', true)); - }); - - it(`returns true: 'a' => 'à' (subs allowed)`, () => { - assert.isTrue(isSubstitutionAlignable('à', 'a', true)); - }); - - // Leading substitution: valid if enough of the remaining word matches. - // Could totally happen from a legit Keyman keyboard rule. - it(`returns true: 'can' => 'van' (subs allowed)`, () => { - assert.isTrue(isSubstitutionAlignable('van', 'can', true)); - }); - - // Trailing substitution: invalid if not allowed. - it(`returns false: 'can' => 'cap' (subs not allowed)`, () => { - assert.isFalse(isSubstitutionAlignable('cap', 'can')); - }); - - // Trailing substitution: valid. - it(`returns false: 'can' => 'cap' (subs allowed)`, () => { - assert.isTrue(isSubstitutionAlignable('cap', 'can', true)); - }); - - it(`returns true: 'clasts' => 'clasps' (subs allowed)`, () => { - assert.isTrue(isSubstitutionAlignable('clasps', 'clasts', true)); - }); - - // random deletion at the start + later substitution = still permitted - it(`returns false: 'clasts' => 'lasps' (subs allowed)`, () => { - assert.isTrue(isSubstitutionAlignable('lasps', 'clasts', true)); - }); - - // deletion, then sub at the start, duplicate letters with one dropped - it(`returns true: 'applesauce' => 'plesauce' (subs not allowed)`, () => { - // The double-p adds a fun complication once the first gets dropped. 
- assert.isTrue(isSubstitutionAlignable('applesauce', 'plesauce')); - }); -}); - - -describe('computeAlignment', () => { - it("properly matches and aligns when contexts match", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [...baseContext]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 5, - tailEditLength: 0, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns with applied-suggestion contexts", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'o' - ]; - const newContext = [...baseContext]; - newContext[4] = 'over'; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'substitute', input: 4, match: 4} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 4, - tailEditLength: 1, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns with applied-suggestion at start of context", () => { - const baseContext = [ - 'te' - ]; - const newContext = [ - 'testing', - ' ', - '' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false, true); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'insert', match: 1}, - {op: 'insert', match: 2} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 0, - tailEditLength: 1, - tailTokenShift: 2 - }); - }); - - it("detects unalignable contexts - no matching tokens", () => { - const baseContext = [ - 'swift', 'tan', 'wolf', 'leaped', 'across' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - { op: 'substitute', input: 0, match: 0 }, - { op: 'substitute', input: 1, match: 1 }, - { op: 'substitute', input: 2, match: 2 }, - { op: 'substitute', input: 3, match: 3 }, - { op: 'substitute', input: 4, match: 4 } - ] - }); - }); - - it("detects unalignable contexts - too many mismatching tokens", () => { - const baseContext = [ - 'swift', 'tan', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'substitute', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4} - ], - }); - }); - - it("fails alignment for leading-edge word substitutions", () => { - const baseContext = [ - 'swift', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'substitute', 
input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4} - ] - }); - }); - - it("fails alignment for small leading-edge word substitutions", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'sick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4} - ] - }); - }); - - it("properly matches and aligns when lead token is modified", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'uick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 4, - tailEditLength: 0, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns when lead token is removed", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'match', input: 1, match: 0}, - {op: 'match', input: 2, match: 1}, - {op: 'match', input: 3, match: 2}, - {op: 'match', input: 4, match: 3} - ], - leadTokenShift: -1, - leadEditLength: 0, - matchLength: 4, - tailEditLength: 0, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns when lead token is added", () => { - const baseContext = [ - 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'insert', match: 0}, - {op: 'match', input: 0, match: 1}, - {op: 'match', input: 1, match: 2}, - {op: 'match', input: 2, match: 3}, - {op: 'match', input: 3, match: 4} - ], - leadTokenShift: 1, - leadEditLength: 0, - matchLength: 4, - tailEditLength: 0, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns when lead tokens are removed and modified", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'ox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'delete', input: 1}, - {op: 'substitute', input: 2, match: 0}, - {op: 'match', input: 3, match: 1}, - {op: 'match', input: 4, match: 2}, - ], - leadTokenShift: -2, - leadEditLength: 1, - matchLength: 2, - tailEditLength: 0, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns when lead tokens are added and modified", () => { - const baseContext = [ - 'rown', 'fox', 'jumped', 'over' - ]; - 
const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'insert', match: 0}, - {op: 'substitute', input: 0, match: 1}, - {op: 'match', input: 1, match: 2}, - {op: 'match', input: 2, match: 3}, - {op: 'match', input: 3, match: 4}, - ], - leadTokenShift: 1, - leadEditLength: 1, - matchLength: 3, - tailEditLength: 0, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns when lead token is removed and tail token is added", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'brown', 'fox', 'jumped', 'over', 'the' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'match', input: 1, match: 0}, - {op: 'match', input: 2, match: 1}, - {op: 'match', input: 3, match: 2}, - {op: 'match', input: 4, match: 3}, - {op: 'insert', match: 4} - ], - leadTokenShift: -1, - leadEditLength: 0, - matchLength: 4, - tailEditLength: 0, - tailTokenShift: 1 - }); - }); - - it("properly matches and aligns when lead token and tail token are modified", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'ove' - ]; - const newContext = [ - 'uick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'substitute', input: 4, match: 4} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 3, - tailEditLength: 1, - tailTokenShift: 0 - }); - }); - - it("properly matches and aligns when lead token and tail token are modified + new token appended", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'ove' - ]; - const newContext = [ - 'uick', 'brown', 'fox', 'jumped', 'over', 't' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'substitute', input: 4, match: 4}, - {op: 'insert', match: 5} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 3, - tailEditLength: 1, - tailTokenShift: 1 - }); - }); - - it("properly handles context window sliding backward", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'e', 'quick', 'brown', 'fox', 'jumped', 'ove' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'insert', match: 0}, - {op: 'match', input: 0, match: 1}, - {op: 'match', input: 1, match: 2}, - {op: 'match', input: 2, match: 3}, - {op: 'match', input: 3, match: 4}, - {op: 'substitute', input: 4, match: 5} - ], - leadTokenShift: 1, - leadEditLength: 0, - matchLength: 4, - tailEditLength: 1, - tailTokenShift: 0 - }); - }); - - it("properly handles context window sliding far backward", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 
'the', 'quick', 'brown', 'fox', 'jumped' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'insert', match: 0}, - {op: 'match', input: 0, match: 1}, - {op: 'match', input: 1, match: 2}, - {op: 'match', input: 2, match: 3}, - {op: 'match', input: 3, match: 4}, - {op: 'delete', input: 4} - ], - leadTokenShift: 1, - leadEditLength: 0, - matchLength: 4, - tailEditLength: 0, - tailTokenShift: -1 - }); - }); - - it("properly handles context window sliding farther backward", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'the', 'quick', 'brown', 'fox', 'jumpe' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'insert', match: 0}, - {op: 'match', input: 0, match: 1}, - {op: 'match', input: 1, match: 2}, - {op: 'match', input: 2, match: 3}, - {op: 'substitute', input: 3, match: 4}, - {op: 'delete', input: 4} - ], - leadTokenShift: 1, - leadEditLength: 0, - matchLength: 3, - tailEditLength: 1, - tailTokenShift: -1 - }); - }); - - it("fails alignment for mid-head deletion", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'delete', input: 1}, - {op: 'match', input: 2, match: 1}, - {op: 'match', input: 3, match: 2}, - {op: 'match', input: 4, match: 3} - ] - }); - }); - - it("fails alignment for mid-head insertion", () => { - const baseContext = [ - 'quick', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'insert', match: 1}, - {op: 'match', input: 1, match: 2}, - {op: 'match', input: 2, match: 3}, - {op: 'match', input: 3, match: 4} - ] - }); - }); - - it("fails alignment for mid-tail deletion", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'delete', input: 3}, - {op: 'match', input: 4, match: 3} - ] - }); - }); - - it("fails alignment for mid-tail insertion", () => { - const baseContext = [ - 'quick', 'brown', 'fox', 'jumped', 'over' - ]; - const newContext = [ - 'quick', 'brown', 'fox', 'jumped', 'far', 'over' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: false, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'insert', match: 4}, - {op: 'match', input: 4, match: 5} - ] - }); - }); - - it("handles late-context suggestion application after backspace", () => { - const baseContext = [ - 'quick', ' ', 'brown', ' ', 'fox', 
' ', 'jumped', ' ', 'oven', ' ', '' - ]; - const newContext = [ - 'quick', ' ', 'brown', ' ', 'fox', ' ', 'jumped', ' ', 'over', ' ', '' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'substitute', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'match', input: 10, match: 10} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 8, - tailEditLength: 3, - tailTokenShift: 0 - }); - }); - - it("handles late-context application of default suggestion", () => { - const baseContext = [ - 'quick', ' ', 'brown', ' ', 'fox', ' ', 'jumped', ' ', 'over', ' ', '' - ]; - const newContext = [ - 'quick', ' ', 'brown', ' ', 'fox', ' ', 'jumped', ' ', 'over', ' ', 'the', ' ', '' - ]; - - const computedAlignment = computeAlignment(baseContext, newContext, false); - - assert.deepEqual(computedAlignment, { - canAlign: true, - editPath: [ - {op: 'match', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'match', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'substitute', input: 10, match: 10}, - {op: 'insert', match: 11}, - {op: 'insert', match: 12} - ], - leadTokenShift: 0, - leadEditLength: 0, - matchLength: 10, - tailEditLength: 1, - tailTokenShift: 2 - }); - }); - - it("handles sliding context-window scenarios", () => { - // // Explicitly-defined window, though it's not needed directly by the method. - // const config = { - // leftContextCodePoints: 64, - // rightContextCodePoints: 64 - // }; - - const baseContext1 = [ - // "ap" prefix not in actual view, but preserved by prior tokenization rounds. - "applesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", - "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "be" - ]; - - const incomingContext1 = [ - "plesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", - "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "bes" - ]; - - // 66 chars above, vs a sliding window of length 64. 
- assert.equal(baseContext1.reduce((accum, curr) => accum + curr.length, 0), 66); - // Actual window + one newly-typed character - assert.equal(incomingContext1.reduce((accum, curr) => accum + curr.length, 0), 65); - - assert.deepEqual(computeAlignment(baseContext1, incomingContext1, true), { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'match', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'match', input: 10, match: 10}, - {op: 'match', input: 11, match: 11}, - {op: 'match', input: 12, match: 12}, - {op: 'match', input: 13, match: 13}, - {op: 'match', input: 14, match: 14}, - {op: 'match', input: 15, match: 15}, - {op: 'match', input: 16, match: 16}, - {op: 'match', input: 17, match: 17}, - {op: 'match', input: 18, match: 18}, - {op: 'match', input: 19, match: 19}, - {op: 'match', input: 20, match: 20}, - {op: 'match', input: 21, match: 21}, - {op: 'substitute', input: 22, match: 22} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 21, - tailEditLength: 1, - tailTokenShift: 0 - }); - - // Our tokenization scheme remembers the full original word before any of it slid out of - // the context window. - const baseContext2 = [ - "applesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", - // +2 +1 +4 - "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "best", " ", "brea" - ]; - - const incomingContext2 = [ - // "plesauce" => "e": -7 chars. - "e", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", - "like", " ", "they'd", " ", "make", " ", "for", " ", "the", " ", "best", " ", "break" - ]; - - // 73 chars above, vs a sliding window of length 64. 
- assert.equal(baseContext2.reduce((accum, curr) => accum + curr.length, 0), 73); - // Actual window + one newly-typed character - assert.equal(incomingContext2.reduce((accum, curr) => accum + curr.length, 0), 65); - - assert.deepEqual(computeAlignment(baseContext2, incomingContext2, true), { - canAlign: true, - editPath: [ - {op: 'substitute', input: 0, match: 0}, - {op: 'match', input: 1, match: 1}, - {op: 'match', input: 2, match: 2}, - {op: 'match', input: 3, match: 3}, - {op: 'match', input: 4, match: 4}, - {op: 'match', input: 5, match: 5}, - {op: 'match', input: 6, match: 6}, - {op: 'match', input: 7, match: 7}, - {op: 'match', input: 8, match: 8}, - {op: 'match', input: 9, match: 9}, - {op: 'match', input: 10, match: 10}, - {op: 'match', input: 11, match: 11}, - {op: 'match', input: 12, match: 12}, - {op: 'match', input: 13, match: 13}, - {op: 'match', input: 14, match: 14}, - {op: 'match', input: 15, match: 15}, - {op: 'match', input: 16, match: 16}, - {op: 'match', input: 17, match: 17}, - {op: 'match', input: 18, match: 18}, - {op: 'match', input: 19, match: 19}, - {op: 'match', input: 20, match: 20}, - {op: 'match', input: 21, match: 21}, - {op: 'match', input: 22, match: 22}, - {op: 'match', input: 23, match: 23}, - {op: 'substitute', input: 24, match: 24} - ], - leadTokenShift: 0, - leadEditLength: 1, - matchLength: 23, - tailEditLength: 1, - tailTokenShift: 0 - }); - - const baseContext3 = [ - "applesauce", " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", "like", " ", - "they'd", " ", "make", " ", "for", " ", "the", " ", "best", " ", "break" - ]; - - const incomingContext3 = [ - " ", "and", " ", "orange", " ", "juice", " ", "don't", " ", "seem", " ", "like", " ", - "they'd", " ", "make", " ", "for", " ", "the", " ", "best", " ", "breakf" - ]; - - // 74 chars above, vs a sliding window of length 64. 
- assert.equal(baseContext3.reduce((accum, curr) => accum + curr.length, 0), 74); - // Actual window + one newly-typed character - assert.equal(incomingContext3.reduce((accum, curr) => accum + curr.length, 0), 65); - - assert.deepEqual(computeAlignment(baseContext3, incomingContext3, true), { - canAlign: true, - editPath: [ - {op: 'delete', input: 0}, - {op: 'match', input: 1, match: 0}, - {op: 'match', input: 2, match: 1}, - {op: 'match', input: 3, match: 2}, - {op: 'match', input: 4, match: 3}, - {op: 'match', input: 5, match: 4}, - {op: 'match', input: 6, match: 5}, - {op: 'match', input: 7, match: 6}, - {op: 'match', input: 8, match: 7}, - {op: 'match', input: 9, match: 8}, - {op: 'match', input: 10, match: 9}, - {op: 'match', input: 11, match: 10}, - {op: 'match', input: 12, match: 11}, - {op: 'match', input: 13, match: 12}, - {op: 'match', input: 14, match: 13}, - {op: 'match', input: 15, match: 14}, - {op: 'match', input: 16, match: 15}, - {op: 'match', input: 17, match: 16}, - {op: 'match', input: 18, match: 17}, - {op: 'match', input: 19, match: 18}, - {op: 'match', input: 20, match: 19}, - {op: 'match', input: 21, match: 20}, - {op: 'match', input: 22, match: 21}, - {op: 'match', input: 23, match: 22}, - {op: 'substitute', input: 24, match: 23} - ], - leadTokenShift: -1, - leadEditLength: 0, - matchLength: 23, - tailEditLength: 1, - tailTokenShift: 0 - }); - }); -}); \ No newline at end of file diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/transform-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/transform-tokenization.tests.ts deleted file mode 100644 index de9c5d88977..00000000000 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/transform-tokenization.tests.ts +++ /dev/null @@ -1,635 +0,0 @@ -import { assert } from 'chai'; - -import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; -import { Token, Tokenization, tokenize } from '@keymanapp/models-templates'; -import { LexicalModelTypes } from '@keymanapp/common-types'; - -import { tokenizeTransform } from '@keymanapp/lm-worker/test-index'; - -import Context = LexicalModelTypes.Context; -import Transform = LexicalModelTypes.Transform; - -const defaultTokenize = (context: Context) => tokenize(defaultBreaker, context); - -describe('tokenizeTransform', () => { - describe('with default wordbreaking', () => { - it('produces a single empty transform at index 0 when an empty transform is input', () => { - const context: Context = { - left: 'an apple a date', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - const editTransform = { - insert: '', - deleteLeft: 0 - }; - - const result = tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - assert.equal(result.size, 1); - assert.deepEqual(result.get(0), editTransform); - }); - - it('properly handles simple token-edit transform', () => { - const context: Context = { - left: 'an apple a date', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - const editTransform = { - insert: 'y', - deleteLeft: 2 - }; - - const result = tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - assert.equal(result.size, 1); - assert.deepEqual(result.get(0), editTransform); - }); - - it('properly handles simple token-replacing transform', () => { - const context = { - left: 'an apple a date', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - const editTransform = { - insert: 'week', - deleteLeft: 4 - }; - - const result = 
tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - assert.equal(result.size, 1); - assert.deepEqual(result.get(0), editTransform); - }); - - it('handles simple token-replacing transform with cross-token deleteLeft', () => { - const context = { - left: 'an apple a date', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - // 'an apple any' - const editTransform = { - insert: 'ny', - deleteLeft: 5 - }; - - const result = tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - const expectedMap = new Map(); - expectedMap.set(-2, { - insert: 'ny', - deleteLeft: 0 - }); - expectedMap.set(-1, { - insert: '', - deleteLeft: 1 - }); - expectedMap.set(0, { - insert: '', - deleteLeft: 4 - }); - - assert.equal(result.size, expectedMap.size); - assert.deepEqual(result, expectedMap); - }); - - it('properly handles a simple appended whitespace', () => { - const context = { - left: 'an apple a day', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - const editTransform = { - insert: ' ', - deleteLeft: 0 - }; - - const result = tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - const expectedMap = new Map(); - // The whitespace belongs on the whitespace token that will be added. - expectedMap.set(1, editTransform); - // The default-breaker adds an empty token after whitespace, hence this - // empty transform. - expectedMap.set(2, { insert: '', deleteLeft: 0 }); - - assert.equal(result.size, expectedMap.size); - assert.deepEqual(result, expectedMap); - }); - - it('properly handles a simple appended period', () => { - const context = { - left: 'an apple a day', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - const editTransform = { - insert: '.', - deleteLeft: 0 - }; - - const result = tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - // The default wordbreaker does not (currently) append a blank token - // after standard English punctuation. - const expectedMap = new Map(); - expectedMap.set(1, editTransform); - assert.equal(result.size, expectedMap.size); - assert.deepEqual(result, expectedMap); - }); - - it('properly deletes a simple appended whitespace', () => { - const context = { - left: 'an apple a day ', - right: '', - startOfBuffer: true, - endOfBuffer: true - }; - - const editTransform = { - insert: '', - deleteLeft: 1 - }; - - const result = tokenizeTransform( - defaultTokenize, - context, - editTransform - ); - - const expectedMap = new Map(); - // The whitespace belongs on the whitespace token that will be added. - expectedMap.set(-1, editTransform); - // The default-breaker adds an empty token after whitespace, hence this - // empty transform. 
-      expectedMap.set(0, { insert: '', deleteLeft: 0 });
-
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles word-breakable transforms (case 1)', () => {
-      const context = {
-        left: 'an apple a dat',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'y k',
-        deleteLeft: 1
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      // dat => day
-      expectedMap.set(0, { insert: 'y', deleteLeft: 1 });
-      // new whitespace
-      expectedMap.set(1, { insert: ' ', deleteLeft: 0 });
-      // new 'k' token
-      expectedMap.set(2, { insert: 'k', deleteLeft: 0 });
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles word-breakable transforms (case 2)', () => {
-      const context = {
-        left: 'an apple a dat',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'y. ',
-        deleteLeft: 1
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, { insert: 'y', deleteLeft: 1 });
-      expectedMap.set(1, { insert: '.', deleteLeft: 0 });
-      expectedMap.set(2, { insert: ' ', deleteLeft: 0 });
-      expectedMap.set(3, { insert: '', deleteLeft: 0 });
-      assert.equal(result.size, 4);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles complex breakable cases', () => {
-      const context = {
-        left: 'an apple as date',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      // 'an apple any'
-      const editTransform = {
-        insert: 'ny day',
-        deleteLeft: 6
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      // as => any
-      expectedMap.set(-2, { insert: 'ny', deleteLeft: 1 }); // 2 back from the last token before the text insertion point.
-      // ' ' replaced with another ' ' (but still edited)
-      expectedMap.set(-1, { insert: ' ', deleteLeft: 1 });
-      // date => day, but with full replacement due to the large deleteLeft.
-      expectedMap.set( 0, { insert: 'day', deleteLeft: 4 }); // The original token before the text insertion point.
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('properly aligns tokenization of transforms that match-replace existing tokens (1)', () => {
-      const context = {
-        left: 'properly',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      // Case: the user had input a backspace and then selected a suggestion that restored
-      // the original word (which also appended whitespace).
-      const editTransform = {
-        insert: 'properly ',
-        deleteLeft: 8
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, { insert: 'properly', deleteLeft: 8 });
-      expectedMap.set(1, { insert: ' ', deleteLeft: 0 });
-      expectedMap.set(2, { insert: '', deleteLeft: 0 });
-      assert.equal(result.size, 3);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('properly aligns tokenization of transforms that match-replace existing tokens (2)', () => {
-      const context = {
-        left: 'do it properly',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      // Case: the user had input a backspace and then selected a suggestion that restored
-      // the original word (which also appended whitespace).
-      const editTransform = {
-        insert: 'properly ',
-        deleteLeft: 8
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, { insert: 'properly', deleteLeft: 8 });
-      expectedMap.set(1, { insert: ' ', deleteLeft: 0 });
-      expectedMap.set(2, { insert: '', deleteLeft: 0 });
-      assert.equal(result.size, 3);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('properly places extra whitespaces on preceding whitespace token', () => {
-      const context = {
-        left: 'do it properly ', // 'do', ' ', 'it', ' ', 'properly', ' ', ''
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      // Adjacent whitespace entries are generally merged into a single blob.
-      const editTransform = {
-        insert: ' ', // Should be combined with the final ' ', not the tail ''.
-        deleteLeft: 0
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(-1, { insert: ' ', deleteLeft: 0 });
-      expectedMap.set(0, { insert: '', deleteLeft: 0 });
-      assert.equal(result.size, 2);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('properly aligns degenerate input cases (1)', () => {
-      const context = {
-        left: 'quick brown fox', // 'quick', ' ', 'brown', ' ', 'fox'
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'fox and brown fox', // => quick fox and brown fox
-        deleteLeft: 9
-      };
-
-      const result = tokenizeTransform(
-        defaultTokenize,
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(-2, { insert: 'fox', deleteLeft: 5 });
-      expectedMap.set(-1, { insert: ' ', deleteLeft: 1 });
-      expectedMap.set(0, { insert: 'and', deleteLeft: 3 });
-      expectedMap.set(1, { insert: ' ', deleteLeft: 0 });
-      expectedMap.set(2, { insert: 'brown', deleteLeft: 0 });
-      expectedMap.set(3, { insert: ' ', deleteLeft: 0 });
-      expectedMap.set(4, { insert: 'fox', deleteLeft: 0 });
-      assert.equal(result.size, 7);
-      assert.deepEqual(result, expectedMap);
-    });
-  });
-
-  describe('with mocked dictionary-based wordbreaking', () => {
-    function mockedTokenization(map: Map<string, string[]>) {
-      return (context: Context) => {
-        let tokens = map.get(context.left);
-        if(!tokens) {
-          assert.fail("Mocked tokenization was not properly constructed");
-        }
-        return {
-          left: tokens.map((text) => {
-            return {text: text} as Token
-          })
-        } as Tokenization;
-      }
-    }
-
-    it('properly handles simple token-edit transform', () => {
-      const context = {
-        left: 'anappleadate',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'y',
-        deleteLeft: 2
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadate', ['an', 'apple', 'a', 'date']);
-      mockMap.set('anappleaday', ['an', 'apple', 'a', 'day']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, editTransform); // The original token before the text insertion point.
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('properly handles simple token-replacing transform', () => {
-      const context = {
-        left: 'anappleadate',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'week',
-        deleteLeft: 4
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadate', ['an', 'apple', 'a', 'date']);
-      mockMap.set('anappleaweek', ['an', 'apple', 'a', 'week']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, editTransform); // The original token before the text insertion point.
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles simple token-replacing transform with cross-token deleteLeft', () => {
-      const context = {
-        left: 'anappleadate',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      // 'an apple any'
-      const editTransform = {
-        insert: 'ny',
-        deleteLeft: 4
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadate', ['an', 'apple', 'a', 'date']);
-      mockMap.set('anappleany', ['an', 'apple', 'any']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(-1, { insert: 'ny', deleteLeft: 0 });
-      expectedMap.set( 0, { insert: '', deleteLeft: 4 });
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles word-breakable transforms (case 1)', () => {
-      const context = {
-        left: 'anappleadat',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'yk',
-        deleteLeft: 1
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadat', ['an', 'apple', 'a', 'dat']);
-      mockMap.set('anappleadayk', ['an', 'apple', 'a', 'day', 'k']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, { insert: 'y', deleteLeft: 1 });
-      expectedMap.set(1, { insert: 'k', deleteLeft: 0 });
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles word-breakable transforms (case 2)', () => {
-      const context = {
-        left: 'anappleadat',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'y.',
-        deleteLeft: 1
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadat', ['an', 'apple', 'a', 'dat']);
-      mockMap.set('anappleaday.', ['an', 'apple', 'a', 'day', '.']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, { insert: 'y', deleteLeft: 1 });
-      expectedMap.set(1, { insert: '.', deleteLeft: 0 });
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles word-breakable transforms (case 2 alternate output)', () => {
-      const context = {
-        left: 'anappleadat',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      const editTransform = {
-        insert: 'y.',
-        deleteLeft: 1
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadat', ['an', 'apple', 'a', 'dat']);
-      mockMap.set('anappleaday.', ['an', 'apple', 'a', 'day', '.', '']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(0, { insert: 'y', deleteLeft: 1 });
-      expectedMap.set(1, { insert: '.', deleteLeft: 0 });
-      expectedMap.set(2, { insert: '', deleteLeft: 0});
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-
-    it('handles complex breakable cases', () => {
-      const context = {
-        left: 'anappleadate',
-        right: '',
-        startOfBuffer: true,
-        endOfBuffer: true
-      };
-
-      // 'an apple any'
-      const editTransform = {
-        insert: 'nyday',
-        deleteLeft: 4
-      };
-
-      const mockMap = new Map();
-      mockMap.set('anappleadate', ['an', 'apple', 'a', 'date']);
-      mockMap.set('anappleanyday', ['an', 'apple', 'any', 'day']);
-      const result = tokenizeTransform(
-        mockedTokenization(mockMap),
-        context,
-        editTransform
-      );
-
-      const expectedMap = new Map();
-      expectedMap.set(-1, { insert: 'ny', deleteLeft: 0 });
-      expectedMap.set( 0, { insert: 'day', deleteLeft: 4 });
-      assert.equal(result.size, expectedMap.size);
-      assert.deepEqual(result, expectedMap);
-    });
-  });
-});
\ No newline at end of file
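
For reviewers tracing what behavior these deleted tests pinned down, the convention they assert can be summarized: tokenizeTransform(tokenize, context, transform) returns a Map keyed by token index relative to the final pre-existing token of the context (key 0 = that token, negative keys reach back into earlier tokens consumed by deleteLeft, positive keys describe tokens newly created by the insert), with each value a per-token { insert, deleteLeft } transform. The sketch below only illustrates that keying convention; applyTokenTransforms and its flat token-array shape are hypothetical and are not part of the lm-worker API.

// Hypothetical helper, for illustration only: applies a per-token transform map
// of the shape asserted in the tests above to a tokenized left-hand context.
type TokenTransform = { insert: string, deleteLeft: number };

function applyTokenTransforms(tokens: string[], edits: Map<number, TokenTransform>): string[] {
  // Key 0 addresses the final pre-existing token; negative keys count back from
  // it, while positive keys append tokens introduced by the edit.
  const lastIndex = tokens.length - 1;
  const result = tokens.slice();
  for(const [key, edit] of [...edits.entries()].sort((a, b) => a[0] - b[0])) {
    if(key <= 0) {
      const base = result[lastIndex + key];
      result[lastIndex + key] = base.slice(0, base.length - edit.deleteLeft) + edit.insert;
    } else {
      // Newly-created tokens carry no deleteLeft; they are appended in key order.
      result.push(edit.insert);
    }
  }
  // Simplification for this sketch: tokens reduced to '' were fully deleted by the edit.
  return result.filter((token) => token !== '');
}

// Mirrors 'handles simple token-replacing transform with cross-token deleteLeft':
// 'an apple a date' + { insert: 'ny', deleteLeft: 5 } => 'an apple any'
const edited = applyTokenTransforms(
  ['an', ' ', 'apple', ' ', 'a', ' ', 'date'],
  new Map<number, TokenTransform>([
    [-2, { insert: 'ny', deleteLeft: 0 }],
    [-1, { insert: '', deleteLeft: 1 }],
    [ 0, { insert: '', deleteLeft: 4 }]
  ])
);
console.log(edited.join('')); // 'an apple any'

The negative-key convention matches the tests' own inline comments ("2 back from the last token before the text insertion point"; "The original token before the text insertion point").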