diff --git a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js
index 6a4148b6695..fa09ca20dec 100644
--- a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js
+++ b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js
@@ -45,7 +45,7 @@ describe('ContextTracker', function() {
       assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);
     });

-    it("properly matches and aligns when a 'wordbreak' is added'", function() {
+    it("properly matches and aligns when a 'wordbreak' is added", function() {
       let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
       let transform = {
         insert: ' ',
@@ -56,7 +56,7 @@ describe('ContextTracker', function() {
       let rawTokens = ["an", null, "apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

       let existingState = ContextTracker.modelContextState(existingContext);
-      let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
+      let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
       assert.isNotNull(state);
       assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

@@ -65,6 +65,26 @@ describe('ContextTracker', function() {
       assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
     });

+    it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() {
+      let existingContext = ["'"];
+      let transform = {
+        insert: 'a',
+        deleteLeft: 0
+      }
+      let newContext = Array.from(existingContext);
+      newContext.push('a'); // The incoming transform should produce a new token WITH TEXT.
+      let rawTokens = ["'", null, "a"];
+
+      let existingState = ContextTracker.modelContextState(existingContext);
+      let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
+      assert.isNotNull(state);
+      assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);
+
+      // The 'wordbreak' transform
+      assert.isEmpty(state.tokens[state.tokens.length - 2].transformDistributions);
+      assert.isNotEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
+    });
+
     it("properly matches and aligns when lead token is removed AND a 'wordbreak' is added'", function() {
       let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
       let transform = {
@@ -77,7 +97,7 @@ describe('ContextTracker', function() {
       let rawTokens = ["apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

       let existingState = ContextTracker.modelContextState(existingContext);
-      let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
+      let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
       assert.isNotNull(state);
       assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

diff --git a/common/predictive-text/unit_tests/headless/transform-utils.js b/common/predictive-text/unit_tests/headless/transform-utils.js
new file mode 100644
index 00000000000..b00a24a1230
--- /dev/null
+++ b/common/predictive-text/unit_tests/headless/transform-utils.js
@@ -0,0 +1,58 @@
+var assert = require('chai').assert;
+
+let TransformUtils = require('../../../web/lm-worker/build/intermediate.js').TransformUtils;
+
+describe('TransformUtils', function () {
+  describe('isWhitespace', function () {
+    it("should not match a string containing standard alphabetic characters", function () {
+      let testTransforms = [{
+        insert: "a ",
+        deleteLeft: 0
+      }, {
+        insert: " a",
+        deleteLeft: 0
+      }, {
+        insert: "ab",
+        deleteLeft: 0
+      }];
+
+      testTransforms.forEach((transform) => assert.isFalse(TransformUtils.isWhitespace(transform), `failed with: '${transform.insert}'`));
+    });
+
+    it("should match a simple ' ' transform", function() {
+      transform = {
+        insert: " ",
+        deleteLeft: 0
+      };
+
+      assert.isTrue(TransformUtils.isWhitespace(transform));
+    });
+
+    it("should match a simple ' ' transform with delete-left", function() {
+      transform = {
+        insert: " ",
+        deleteLeft: 1
+      };
+
+      assert.isTrue(TransformUtils.isWhitespace(transform));
+    });
+
+    it("should match a transform consisting of multiple characters of only whitespace", function() {
+      transform = {
+        insert: " \n\r\u00a0\t\u2000 ",
+        deleteLeft: 0
+      };
+
+      assert.isTrue(TransformUtils.isWhitespace(transform));
+    });
+
+    it("stress tests", function() {
+      transform = {
+        insert: " \n\r\u00a0\ta\u2000 ", // the 'a' should cause failure.
+        deleteLeft: 0
+      };
+
+      assert.isFalse(TransformUtils.isWhitespace(transform));
+    });
+  });
+});
diff --git a/common/predictive-text/unit_tests/headless/worker-model-compositor.js b/common/predictive-text/unit_tests/headless/worker-model-compositor.js
index f7e9173cee1..dc8a36e9dc0 100644
--- a/common/predictive-text/unit_tests/headless/worker-model-compositor.js
+++ b/common/predictive-text/unit_tests/headless/worker-model-compositor.js
@@ -66,12 +66,53 @@ describe('ModelCompositor', function() {
         // Suggestions always delete the full root of the suggestion.
         //
         // After a backspace, that means the text 'the' - 3 chars.
-        // Char 4 is for the original backspace, as suggstions are built
+        // Char 4 is for the original backspace, as suggestions are built
         // based on the context state BEFORE the triggering input -
         // here, a backspace.
         assert.equal(suggestion.transform.deleteLeft, 4);
       });
     });
+
+    it('properly handles suggestions for the first letter after a ` `', function() {
+      let compositor = new ModelCompositor(plainModel);
+      let context = {
+        left: 'the', startOfBuffer: true, endOfBuffer: true,
+      };
+
+      let inputTransform = {
+        insert: ' ',
+        deleteLeft: 0
+      };
+
+      let suggestions = compositor.predict(inputTransform, context);
+      suggestions.forEach(function(suggestion) {
+        // After a space, predictions are based on a new, zero-length root.
+        // With nothing to replace, .deleteLeft should be zero.
+        assert.equal(suggestion.transform.deleteLeft, 0);
+      });
+    });
+
+    it('properly handles suggestions for the first letter after a `\'`', function() {
+      let compositor = new ModelCompositor(plainModel);
+      let context = {
+        left: "the '", startOfBuffer: true, endOfBuffer: true,
+      };
+
+      // This results in a new word boundary (between the `'` and the `a`).
+      // Basically, an implied (but nonexistent) ` `.
+      let inputTransform = {
+        insert: "a",
+        deleteLeft: 0
+      };
+
+      let suggestions = compositor.predict(inputTransform, context);
+      suggestions.forEach(function(suggestion) {
+        // Suggestions always delete the full root of the suggestion.
+        // Which, here, didn't exist before the input. Nothing to
+        // replace => nothing for the suggestion to delete.
+        assert.equal(suggestion.transform.deleteLeft, 0);
+      });
+    });
   });

   describe('applySuggestionCasing', function() {
diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts
index 52e89bedc87..e17e201d785 100644
--- a/common/web/lm-worker/src/correction/context-tracker.ts
+++ b/common/web/lm-worker/src/correction/context-tracker.ts
@@ -32,10 +32,6 @@ namespace correction {
     replacements: TrackedContextSuggestion[];
     activeReplacementId: number = -1;

-    get isNew(): boolean {
-      return this.transformDistributions.length == 0;
-    }
-
     get currentText(): string {
       if(this.replacementText === undefined || this.replacementText === null) {
         return this.raw;
@@ -89,7 +85,7 @@ namespace correction {
         if(token.replacementText) {
           copy.replacementText = token.replacementText;
         }
-        
+
         return copy;
       });
       this.searchSpace = obj.searchSpace;
@@ -139,8 +135,8 @@ namespace correction {

       // Track the Transform that resulted in the whitespace 'token'.
       // Will be needed for phrase-level correction/prediction.
-      whitespaceToken.transformDistributions = [transformDistribution];
-      
+      whitespaceToken.transformDistributions = transformDistribution ? [transformDistribution] : [];
+
       whitespaceToken.raw = null;
       this.tokens.push(whitespaceToken);
     }
@@ -149,19 +145,19 @@ namespace correction {
      * Used for 14.0's backspace workaround, which flattens all previous Distribution
      * entries because of limitations with direct use of backspace transforms.
      * @param tokenText
-     * @param transformId 
+     * @param transformId
      */
     replaceTailForBackspace(tokenText: USVString, transformId: number) {
       this.tokens.pop();
       // It's a backspace transform; time for special handling!
       //
-      // For now, with 14.0, we simply compress all remaining Transforms for the token into 
-      // multiple single-char transforms. Probabalistically modeling BKSP is quite complex, 
+      // For now, with 14.0, we simply compress all remaining Transforms for the token into
+      // multiple single-char transforms. Probabilistically modeling BKSP is quite complex,
       // so we simplify by assuming everything remaining after a BKSP is 'true' and 'intended' text.
       //
       // Note that we cannot just use a single, monolithic transform at this point b/c
-      // of our current edit-distance optimization strategy; diagonalization is currently... 
+      // of our current edit-distance optimization strategy; diagonalization is currently...
       //   not very compatible with that.
       let backspacedTokenContext: Distribution[] = textToCharTransforms(tokenText, transformId).map(function(transform) {
         return [{sample: transform, p: 1.0}];
       });
@@ -175,7 +171,7 @@ namespace correction {

     updateTail(transformDistribution: Distribution, tokenText?: USVString) {
       let editedToken = this.tail;
-      
+
       // Preserve existing text if new text isn't specified.
       tokenText = tokenText || (tokenText === '' ? '' : editedToken.raw);

@@ -191,7 +187,7 @@ namespace correction {

     toRawTokenization() {
       let sequence: USVString[] = [];
-      
+
       for(let token of this.tokens) {
         // Hide any tokens representing wordbreaks. (Thinking ahead to phrase-level possibilities)
         if(token.currentText !== null) {
@@ -281,7 +277,7 @@ namespace correction {
     /**
      * Returns items contained within the circular array, ordered from 'oldest' to 'newest' -
      * the same order in which the items will be dequeued.
-     * @param index 
+     * @param index
      */
     item(index: number) {
       if(index >= this.count) {
@@ -294,7 +290,7 @@ namespace correction {
   }

   export class ContextTracker extends CircularArray {
-    static attemptMatchContext(tokenizedContext: USVString[], 
+    static attemptMatchContext(tokenizedContext: USVString[],
                                matchState: TrackedContextState,
                                transformDistribution?: Distribution,): TrackedContextState {
       // Map the previous tokenized state to an edit-distance friendly version.
@@ -335,7 +331,7 @@ namespace correction {
       }

       // Can happen for the first text input after backspace deletes a wordbreaking character,
-      // thus the new input continues a previous word while dropping the empty word after 
+      // thus the new input continues a previous word while dropping the empty word after
       // that prior wordbreaking character.
       //
       // We can't handle it reliably from this match state, but a previous entry (without the empty token)
@@ -353,7 +349,7 @@ namespace correction {

       // If we've made it here... success! We have a context match!
       let state: TrackedContextState;
-      
+
       if(pushedTail) {
         // On suggestion acceptance, we should update the previous final token.
         // We do it first so that the acceptance is replicated in the new TrackedContextState
@@ -376,7 +372,9 @@ namespace correction {
       if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) {
         primaryInput = null;
       }
-      const isBackspace = primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft > 0 && !primaryInput.deleteRight;
+
+      const isWhitespace = primaryInput && TransformUtils.isWhitespace(primaryInput);
+      const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);
       const finalToken = tokenizedContext[tokenizedContext.length-1];

       /* Assumption: This is an adequate check for its two sub-branches.
@@ -388,7 +386,7 @@ namespace correction {
        *   - Assumption: one keystroke may only cause a single token to be appended to the context
        *     - That is, no "reasonable" keystroke would emit a Transform adding two separate word tokens
        *       - For languages using whitespace to word-break, said keystroke would have to include said whitespace to break the assumption.
-       */ 
+       */

       // If there is/was more than one context token available...
       if(editPath.length > 1) {
@@ -399,17 +397,29 @@ namespace correction {

         // We're adding an additional context token.
         if(pushedTail) {
-          // ASSUMPTION: any transform that triggers this case is a pure-whitespace Transform, as we
-          // need a word-break before beginning a new word's context.
-          // Worth note: when invalid, the lm-layer already has problems in other aspects too.
-          state.pushWhitespaceToTail(transformDistribution);
-
-          let emptyToken = new TrackedContextToken();
-          emptyToken.raw = '';
-          // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
-          // for the new word (token), so the input keystrokes do not correspond to the new text token.
-          emptyToken.transformDistributions = [];
-          state.pushTail(emptyToken);
+          const tokenizedTail = tokenizedContext[tokenizedContext.length - 1];
+          /*
+           * Common-case: most transforms that trigger this case are from pure-whitespace Transforms. MOST.
+           *
+           * Less-common, but noteworthy: some wordbreaks may occur without whitespace. Example:
+           * `"o` => ['"', 'o']. Make sure to double-check against `tokenizedContext`!
+           */
+          let pushedToken = new TrackedContextToken();
+          pushedToken.raw = tokenizedTail;
+
+          if(isWhitespace || !primaryInput) {
+            state.pushWhitespaceToTail(transformDistribution ?? []);
+            // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
+            // for the new word (token), so the input keystrokes do not correspond to the new text token.
+            pushedToken.transformDistributions = [];
+          } else {
+            state.pushWhitespaceToTail();
+            // Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing
+            // any metadata used to construct the new context state token.
+            pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : [];
+          }
+
+          state.pushTail(pushedToken);
         } else { // We're editing the final context token.
           // TODO: Assumption: we didn't 'miss' any inputs somehow.
           //       As is, may be prone to fragility should the lm-layer's tracked context 'desync' from its host's.
@@ -442,7 +452,9 @@ namespace correction {
       return state;
     }

-    static modelContextState(tokenizedContext: USVString[], lexicalModel: LexicalModel): TrackedContextState {
+    static modelContextState(tokenizedContext: USVString[],
+                             transformDistribution: Distribution,
+                             lexicalModel: LexicalModel): TrackedContextState {
       let baseTokens = tokenizedContext.map(function(entry) {
         let token = new TrackedContextToken();
         token.raw = entry;
@@ -483,13 +495,12 @@ namespace correction {
      * Compares the current, post-input context against the most recently-seen contexts from previous prediction calls, returning
      * the most information-rich `TrackedContextState` possible. If a match is found, the state will be annotated with the
      * input information provided to previous prediction calls and persisted correction-search calculations for re-use.
-     * 
-     * @param model 
-     * @param context 
-     * @param mainTransform 
-     * @param transformDistribution 
+     *
+     * @param model
+     * @param context
+     * @param transformDistribution
      */
-    analyzeState(model: LexicalModel, 
+    analyzeState(model: LexicalModel,
                  context: Context,
                  transformDistribution?: Distribution): TrackedContextState {
       if(!model.traverseFromRoot) {
@@ -519,7 +530,7 @@ namespace correction {
       //
       // Assumption: as a caret needs to move to context before any actual transform distributions occur,
       // this state is only reached on caret moves; thus, transformDistribution is actually just a single null transform.
-      let state = ContextTracker.modelContextState(tokenizedContext.left, model);
+      let state = ContextTracker.modelContextState(tokenizedContext.left, transformDistribution, model);
       state.taggedContext = context;
       this.enqueue(state);
       return state;
diff --git a/common/web/lm-worker/src/index.ts b/common/web/lm-worker/src/index.ts
index 7cc3bcbde1d..e3c16fa2c86 100644
--- a/common/web/lm-worker/src/index.ts
+++ b/common/web/lm-worker/src/index.ts
@@ -32,6 +32,7 @@
 ///
 ///
 ///
+///

 /**
  * Encapsulates all the state required for the LMLayer's worker thread.
@@ -407,6 +408,7 @@ if (typeof module !== 'undefined' && typeof module.exports !== 'undefined') {
   module.exports['wordBreakers'] = wordBreakers;
   /// XXX: export the ModelCompositor for testing.
   module.exports['ModelCompositor'] = ModelCompositor;
+  module.exports['TransformUtils'] = TransformUtils;
 } else if (typeof self !== 'undefined' && 'postMessage' in self && 'importScripts' in self) {
   // Automatically install if we're in a Web Worker.
   LMLayerWorker.install(self as any); // really, 'as typeof globalThis', but we're currently getting TS errors from use of that.
diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts
index 329460776d8..9a7da82a803 100644
--- a/common/web/lm-worker/src/model-compositor.ts
+++ b/common/web/lm-worker/src/model-compositor.ts
@@ -16,30 +16,6 @@ class ModelCompositor {
     this.punctuation = ModelCompositor.determinePunctuationFromModel(lexicalModel);
   }

-  protected isWhitespace(transform: Transform): boolean {
-    // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab.
-    let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i;
-
-    // Filter out null-inserts; their high probability can cause issues.
-    if(transform.insert == '') { // Can actually register as 'whitespace'.
-      return false;
-    }
-
-    let insert = transform.insert;
-
-    insert = insert.replace(whitespaceRemover, '');
-
-    return insert == '';
-  }
-
-  protected isBackspace(transform: Transform): boolean {
-    return transform.insert == "" && transform.deleteLeft > 0;
-  }
-
-  protected isEmpty(transform: Transform): boolean {
-    return transform.insert == '' && transform.deleteLeft == 0;
-  }
-
   private predictFromCorrections(corrections: ProbabilityMass[], context: Context): Distribution {
     let returnedPredictions: Distribution = [];

@@ -98,8 +74,8 @@ class ModelCompositor {
     })[0].sample;

     // Only allow new-word suggestions if space was the most likely keypress.
-    let allowSpace = this.isWhitespace(inputTransform);
-    let allowBksp = this.isBackspace(inputTransform);
+    let allowSpace = TransformUtils.isWhitespace(inputTransform);
+    let allowBksp = TransformUtils.isBackspace(inputTransform);

     let postContext = models.applyTransform(inputTransform, context);
     let keepOptionText = this.wordbreak(postContext);
@@ -109,7 +85,7 @@ class ModelCompositor {

     // Used to restore whitespaces if operations would remove them.
     let prefixTransform: Transform;
-    let contextState: correction.TrackedContextState = null;
+    let postContextState: correction.TrackedContextState = null;

     // Section 1: determining 'prediction roots'.
     if(!this.contextTracker) {
@@ -124,18 +100,18 @@ class ModelCompositor {
       predictionRoots = [{sample: inputTransform, p: 1.0}];
       prefixTransform = inputTransform;
     } else {
-      predictionRoots = transformDistribution.map(function(alt) {
+      predictionRoots = transformDistribution.map((alt) => {
         let transform = alt.sample;

         // Filter out special keys unless they're expected.
-        if(this.isWhitespace(transform) && !allowSpace) {
+        if(TransformUtils.isWhitespace(transform) && !allowSpace) {
           return null;
-        } else if(this.isBackspace(transform) && !allowBksp) {
+        } else if(TransformUtils.isBackspace(transform) && !allowBksp) {
           return null;
         }

         return alt;
-      }, this);
+      });
     }

     // Remove `null` entries.
@@ -144,12 +120,15 @@ class ModelCompositor {
       // Running in bulk over all suggestions, duplicate entries may be possible.
       rawPredictions = this.predictFromCorrections(predictionRoots, context);
     } else {
-      contextState = this.contextTracker.analyzeState(this.lexicalModel,
-                                                      postContext,
-                                                      !this.isEmpty(inputTransform) ?
-                                                                    transformDistribution:
-                                                                    null
-                                                      );
+      // Token replacement benefits greatly from knowledge of the prior context state.
+      let contextState = this.contextTracker.analyzeState(this.lexicalModel, context, null);
+      // Corrections and predictions are based upon the post-context state, though.
+      postContextState = this.contextTracker.analyzeState(this.lexicalModel,
+                                                          postContext,
+                                                          !TransformUtils.isEmpty(inputTransform) ?
+                                                                        transformDistribution:
+                                                                        null
+                                                          );

       // TODO: Should we filter backspaces & whitespaces out of the transform distribution?
       // Ideally, the answer (in the future) will be no, but leaving it in right now may pose an issue.
@@ -158,17 +137,68 @@ class ModelCompositor {
       // let's just note that right now, there will only ever be one.
       //
       // The 'eventual' logic will be significantly more complex, though still manageable.
-      let searchSpace = contextState.searchSpace[0];
-
-      let newEmptyToken = false;
-      // Detect if we're starting a new context state.
-      let contextTokens = contextState.tokens;
-      if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) {
-        if(this.isEmpty(inputTransform) || this.isWhitespace(inputTransform)) {
-          newEmptyToken = true;
+      let searchSpace = postContextState.searchSpace[0];
+
+      // No matter the prediction, once we know the root of the prediction, we'll always 'replace' the
+      // same amount of text. We can handle this before the big 'prediction root' loop.
+      let deleteLeft = 0;
+
+      // The amount of text to 'replace' depends upon whatever sort of context change occurs
+      // from the received input.
+      let postContextLength = postContextState.tokens.length;
+      let contextLengthDelta = postContextState.tokens.length - contextState.tokens.length;
+      // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist.
+      if(postContextLength == 0 || contextLengthDelta > 0) {
+        // As the word/token being corrected/predicted didn't originally exist, there's no
+        // part of it to 'replace'.
+        deleteLeft = 0;
+
+        // If the new token is due to whitespace or due to a different input type that would
+        // likely imply a tokenization boundary...
+        if(TransformUtils.isWhitespace(inputTransform)) {
+          /* TODO: consider/implement: the second half of the comment above.
+           * For example: on input of a `'`, predict new words instead of replacing the `'`.
+           * (since after a letter, the `'` will be ignored, anyway)
+           *
+           * Idea: if the model's most likely prediction (with no root) would make a new
+           * token if appended to the current token, that's probably a good case.
+           * Keeps the check simple & quick.
+           *
+           * Might need a mixed mode, though: ';' is close enough that `l` is a reasonable
+           * fat-finger guess. So yeah, we're not addressing this idea right now.
+           * - so... consider multiple context behavior angles when building prediction roots?
+           *
+           * May need something similar to help handle contractions during their construction,
+           * but that'd be within `ContextTracker`.
+           * can'  => [`can`, `'`]
+           * can't => [`can't`]  (WB6, 7 of https://unicode.org/reports/tr29/#Word_Boundary_Rules)
+           *
+           * (Would also help WB7b+c for Hebrew text)
+           */
+
+          // Infer 'new word' mode, even if we received new text when reaching
+          // this position. That new text didn't exist before, so still - nothing
+          // to 'replace'.
           prefixTransform = inputTransform;
-          context = postContext; // Ensure the whitespace token is preapplied!
+          context = postContext;  // As far as predictions are concerned, the post-context state
+                                  // should not be replaced. Predictions are to be rooted on
+                                  // text "up for correction" - so we want a null root for this
+                                  // branch.
+
+          contextState = postContextState;
         }
+        // If the tokenized context length is shorter... sounds like a backspace (or similar).
+      } else if (contextLengthDelta < 0) {
+        /* Ooh, we've dropped context here. Almost certainly from a backspace.
+         * Even if we drop multiple tokens... well, we know exactly how many chars
+         * were actually deleted - `inputTransform.deleteLeft`.
+         * Since we replace a word being corrected/predicted, we take length of the remaining
+         * context's tail token in addition to however far was deleted to reach that state.
+         */
+        deleteLeft = this.wordbreak(postContext).kmwLength() + inputTransform.deleteLeft;
+      } else {
+        // Suggestions are applied to the pre-input context, so get the token's original length.
+        // We're on the same token, so just delete its text for the replacement op.
+        deleteLeft = this.wordbreak(context).kmwLength();
       }

       // TODO: whitespace, backspace filtering. Do it here.
@@ -192,19 +222,6 @@ class ModelCompositor {
         finalInput = inputTransform; // A fallback measure. Greatly matters for empty contexts.
       }

-      let deleteLeft = 0;
-      // remove actual token string. If new token, there should be nothing to delete.
-      if(!newEmptyToken) {
-        // If this is triggered from a backspace, make sure to use its results
-        // and also include its left-deletions! It's the one post-input context case.
-        if(allowBksp) {
-          deleteLeft = this.wordbreak(postContext).kmwLength() + inputTransform.deleteLeft;
-        } else {
-          // Normal case - use the pre-input context.
-          deleteLeft = this.wordbreak(context).kmwLength();
-        }
-      }
-
       // Replace the existing context with the correction.
       let correctionTransform: Transform = {
         insert: correction, // insert correction string
@@ -411,8 +428,8 @@ class ModelCompositor {

     // Store the suggestions on the final token of the current context state (if it exists).
     // Or, once phrase-level suggestions are possible, on whichever token serves as each prediction's root.
-    if(contextState) {
-      contextState.tail.replacements = suggestions.map(function(suggestion) {
+    if(postContextState) {
+      postContextState.tail.replacements = suggestions.map(function(suggestion) {
         return {
           suggestion: suggestion,
           tokenWidth: 1
@@ -659,7 +676,7 @@ class ModelCompositor {
     // than before.
     if(this.contextTracker) {
       let tokenizedContext = models.tokenize(this.lexicalModel.wordbreaker || wordBreakers.default, context);
-      let contextState = correction.ContextTracker.modelContextState(tokenizedContext.left, this.lexicalModel);
+      let contextState = correction.ContextTracker.modelContextState(tokenizedContext.left, null, this.lexicalModel);
       this.contextTracker.enqueue(contextState);
     }
   }
diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts
new file mode 100644
index 00000000000..cbb2c151fe7
--- /dev/null
+++ b/common/web/lm-worker/src/transformUtils.ts
@@ -0,0 +1,15 @@
+class TransformUtils {
+  static isWhitespace(transform: Transform): boolean {
+    // Matches a string that is entirely one or more characters with Unicode general property Z* or the following: CR, LF, and Tab.
+    const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i;
+    return transform.insert.match(whitespaceRemover) != null;
+  }
+
+  static isBackspace(transform: Transform): boolean {
+    return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight;
+  }
+
+  static isEmpty(transform: Transform): boolean {
+    return transform.insert == '' && transform.deleteLeft == 0 && !transform.deleteRight;
+  }
+}
\ No newline at end of file
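
A rough, self-contained TypeScript sketch of the replacement-length ('deleteLeft') rule that the model-compositor.ts change above implements. The names computeDeleteLeft and tokenize are illustrative stand-ins, not APIs from the patch: the whitespace split stands in for the lexical model's wordbreaker, and plain .length stands in for kmwLength()'s code-point counting.

// Illustrative sketch only - not part of the patch above.
interface Transform {
  insert: string;
  deleteLeft: number;
  deleteRight?: number;
}

// Stand-in tokenizer; the real code asks the lexical model's wordbreaker,
// which keeps an empty trailing token after a wordbreak.
function tokenize(text: string): string[] {
  return text.length == 0 ? [] : text.split(/\s+/);
}

// How much existing text a suggestion should replace, given the pre-input context
// and the transform produced by the incoming keystroke. (deleteRight is ignored here.)
function computeDeleteLeft(preContext: string, inputTransform: Transform): number {
  const postContext =
    preContext.slice(0, preContext.length - inputTransform.deleteLeft) + inputTransform.insert;
  const pre = tokenize(preContext);
  const post = tokenize(postContext);

  if(post.length == 0 || post.length > pre.length) {
    // The token being predicted didn't exist before the input - nothing to replace.
    return 0;
  } else if(post.length < pre.length) {
    // Tokens were dropped (e.g. a backspace across a wordbreak): replace the surviving
    // tail token, plus the characters the input itself already deleted.
    return post[post.length - 1].length + inputTransform.deleteLeft;
  } else {
    // Still editing the same token: replace its pre-input text.
    return pre[pre.length - 1].length;
  }
}

// computeDeleteLeft('the',    { insert: ' ', deleteLeft: 0 }) == 0   (new, empty token)
// computeDeleteLeft('the fo', { insert: 'x', deleteLeft: 0 }) == 2   (same token: 'fo')
// computeDeleteLeft('the d',  { insert: '',  deleteLeft: 2 }) == 5   ('the' plus the 2 already-deleted chars)

The new worker-model-compositor.js tests above exercise the first branch: both the ' ' input and the post-apostrophe 'a' input start a token that did not previously exist, so their suggestions carry a deleteLeft of 0.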