From ad8c30c6c07b7f5272c1b1e55976a9b146fbbf06 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 10:26:31 +0700 Subject: [PATCH 01/19] fix(web): pred-text context tracking when wordbreak not caused by whitespace --- .../src/correction/context-tracker.ts | 80 +++++++++++-------- common/web/lm-worker/src/model-compositor.ts | 36 ++------- common/web/lm-worker/src/transformUtils.ts | 25 ++++++ 3 files changed, 79 insertions(+), 62 deletions(-) create mode 100644 common/web/lm-worker/src/transformUtils.ts diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 52e89bedc87..1a94723546c 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -89,7 +89,7 @@ namespace correction { if(token.replacementText) { copy.replacementText = token.replacementText; } - + return copy; }); this.searchSpace = obj.searchSpace; @@ -139,8 +139,10 @@ namespace correction { // Track the Transform that resulted in the whitespace 'token'. // Will be needed for phrase-level correction/prediction. - whitespaceToken.transformDistributions = [transformDistribution]; - + if(transformDistribution) { + whitespaceToken.transformDistributions = [transformDistribution]; + } + whitespaceToken.raw = null; this.tokens.push(whitespaceToken); } @@ -149,19 +151,19 @@ namespace correction { * Used for 14.0's backspace workaround, which flattens all previous Distribution * entries because of limitations with direct use of backspace transforms. * @param tokenText - * @param transformId + * @param transformId */ replaceTailForBackspace(tokenText: USVString, transformId: number) { this.tokens.pop(); // It's a backspace transform; time for special handling! // - // For now, with 14.0, we simply compress all remaining Transforms for the token into - // multiple single-char transforms. Probabalistically modeling BKSP is quite complex, + // For now, with 14.0, we simply compress all remaining Transforms for the token into + // multiple single-char transforms. Probabalistically modeling BKSP is quite complex, // so we simplify by assuming everything remaining after a BKSP is 'true' and 'intended' text. // // Note that we cannot just use a single, monolithic transform at this point b/c - // of our current edit-distance optimization strategy; diagonalization is currently... + // of our current edit-distance optimization strategy; diagonalization is currently... // not very compatible with that. let backspacedTokenContext: Distribution[] = textToCharTransforms(tokenText, transformId).map(function(transform) { return [{sample: transform, p: 1.0}]; @@ -175,7 +177,7 @@ namespace correction { updateTail(transformDistribution: Distribution, tokenText?: USVString) { let editedToken = this.tail; - + // Preserve existing text if new text isn't specified. tokenText = tokenText || (tokenText === '' ? '' : editedToken.raw); @@ -191,7 +193,7 @@ namespace correction { toRawTokenization() { let sequence: USVString[] = []; - + for(let token of this.tokens) { // Hide any tokens representing wordbreaks. (Thinking ahead to phrase-level possibilities) if(token.currentText !== null) { @@ -281,7 +283,7 @@ namespace correction { /** * Returns items contained within the circular array, ordered from 'oldest' to 'newest' - * the same order in which the items will be dequeued. 
- * @param index + * @param index */ item(index: number) { if(index >= this.count) { @@ -294,7 +296,7 @@ namespace correction { } export class ContextTracker extends CircularArray { - static attemptMatchContext(tokenizedContext: USVString[], + static attemptMatchContext(tokenizedContext: USVString[], matchState: TrackedContextState, transformDistribution?: Distribution,): TrackedContextState { // Map the previous tokenized state to an edit-distance friendly version. @@ -335,7 +337,7 @@ namespace correction { } // Can happen for the first text input after backspace deletes a wordbreaking character, - // thus the new input continues a previous word while dropping the empty word after + // thus the new input continues a previous word while dropping the empty word after // that prior wordbreaking character. // // We can't handle it reliably from this match state, but a previous entry (without the empty token) @@ -353,7 +355,7 @@ namespace correction { // If we've made it here... success! We have a context match! let state: TrackedContextState; - + if(pushedTail) { // On suggestion acceptance, we should update the previous final token. // We do it first so that the acceptance is replicated in the new TrackedContextState @@ -376,7 +378,9 @@ namespace correction { if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) { primaryInput = null; } - const isBackspace = primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft > 0 && !primaryInput.deleteRight; + + const isWhitespace = primaryInput && TransformUtils.isWhitespace(primaryInput); + const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput); const finalToken = tokenizedContext[tokenizedContext.length-1]; /* Assumption: This is an adequate check for its two sub-branches. @@ -388,7 +392,7 @@ namespace correction { * - Assumption: one keystroke may only cause a single token to be appended to the context * - That is, no "reasonable" keystroke would emit a Transform adding two separate word tokens * - For languages using whitespace to word-break, said keystroke would have to include said whitespace to break the assumption. - */ + */ // If there is/was more than one context token available... if(editPath.length > 1) { @@ -399,17 +403,29 @@ namespace correction { // We're adding an additional context token. if(pushedTail) { - // ASSUMPTION: any transform that triggers this case is a pure-whitespace Transform, as we - // need a word-break before beginning a new word's context. - // Worth note: when invalid, the lm-layer already has problems in other aspects too. - state.pushWhitespaceToTail(transformDistribution); - - let emptyToken = new TrackedContextToken(); - emptyToken.raw = ''; - // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters - // for the new word (token), so the input keystrokes do not correspond to the new text token. - emptyToken.transformDistributions = []; - state.pushTail(emptyToken); + const tokenizedTail = tokenizedContext[tokenizedContext.length - 1]; + /* + * Common-case: most transforms that trigger this case are from pure-whitespace Transforms. MOST. + * + * Less-common, but noteworthy: some wordbreaks may occur without whitespace. Example: + * `"o` => ['"', 'o']. Make sure to double-check against `tokenizedContext`! 
+ */ + let pushedToken = new TrackedContextToken(); + pushedToken.raw = tokenizedTail; + + if(isWhitespace) { + state.pushWhitespaceToTail(transformDistribution); + // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters + // for the new word (token), so the input keystrokes do not correspond to the new text token. + pushedToken.transformDistributions = []; + } else { + state.pushWhitespaceToTail(); + // Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing + // any metadata used to construct the new context state token. + pushedToken.transformDistributions = [transformDistribution]; + } + + state.pushTail(pushedToken); } else { // We're editing the final context token. // TODO: Assumption: we didn't 'miss' any inputs somehow. // As is, may be prone to fragility should the lm-layer's tracked context 'desync' from its host's. @@ -483,13 +499,13 @@ namespace correction { * Compares the current, post-input context against the most recently-seen contexts from previous prediction calls, returning * the most information-rich `TrackedContextState` possible. If a match is found, the state will be annotated with the * input information provided to previous prediction calls and persisted correction-search calculations for re-use. - * - * @param model - * @param context - * @param mainTransform - * @param transformDistribution + * + * @param model + * @param context + * @param mainTransform + * @param transformDistribution */ - analyzeState(model: LexicalModel, + analyzeState(model: LexicalModel, context: Context, transformDistribution?: Distribution): TrackedContextState { if(!model.traverseFromRoot) { diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 329460776d8..84687c867d4 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -16,30 +16,6 @@ class ModelCompositor { this.punctuation = ModelCompositor.determinePunctuationFromModel(lexicalModel); } - protected isWhitespace(transform: Transform): boolean { - // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. - let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i; - - // Filter out null-inserts; their high probability can cause issues. - if(transform.insert == '') { // Can actually register as 'whitespace'. - return false; - } - - let insert = transform.insert; - - insert = insert.replace(whitespaceRemover, ''); - - return insert == ''; - } - - protected isBackspace(transform: Transform): boolean { - return transform.insert == "" && transform.deleteLeft > 0; - } - - protected isEmpty(transform: Transform): boolean { - return transform.insert == '' && transform.deleteLeft == 0; - } - private predictFromCorrections(corrections: ProbabilityMass[], context: Context): Distribution { let returnedPredictions: Distribution = []; @@ -98,8 +74,8 @@ class ModelCompositor { })[0].sample; // Only allow new-word suggestions if space was the most likely keypress. 
- let allowSpace = this.isWhitespace(inputTransform); - let allowBksp = this.isBackspace(inputTransform); + let allowSpace = TransformUtils.isWhitespace(inputTransform); + let allowBksp = TransformUtils.isBackspace(inputTransform); let postContext = models.applyTransform(inputTransform, context); let keepOptionText = this.wordbreak(postContext); @@ -146,9 +122,9 @@ class ModelCompositor { } else { contextState = this.contextTracker.analyzeState(this.lexicalModel, postContext, - !this.isEmpty(inputTransform) ? - transformDistribution: - null + !TransformUtils.isEmpty(inputTransform) ? + transformDistribution: + null ); // TODO: Should we filter backspaces & whitespaces out of the transform distribution? @@ -164,7 +140,7 @@ class ModelCompositor { // Detect if we're starting a new context state. let contextTokens = contextState.tokens; if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) { - if(this.isEmpty(inputTransform) || this.isWhitespace(inputTransform)) { + if(TransformUtils.isEmpty(inputTransform) || TransformUtils.isWhitespace(inputTransform)) { newEmptyToken = true; prefixTransform = inputTransform; context = postContext; // Ensure the whitespace token is preapplied! diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts new file mode 100644 index 00000000000..8512d4e9a20 --- /dev/null +++ b/common/web/lm-worker/src/transformUtils.ts @@ -0,0 +1,25 @@ +class TransformUtils { + static isWhitespace(transform: Transform): boolean { + // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. + let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i; + + // Filter out null-inserts; their high probability can cause issues. + if(transform.insert == '') { // Can actually register as 'whitespace'. + return false; + } + + let insert = transform.insert; + + insert = insert.replace(whitespaceRemover, ''); + + return insert == ''; + } + + static isBackspace(transform: Transform): boolean { + return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight; + } + + static isEmpty(transform: Transform): boolean { + return transform.insert == '' && transform.deleteLeft == 0; + } +} \ No newline at end of file From be562b30225410afa77d65d1fbed58f6257f4784 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 10:37:50 +0700 Subject: [PATCH 02/19] fix(web): missed method references --- common/web/lm-worker/src/model-compositor.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 84687c867d4..657a7b1ec10 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -100,18 +100,18 @@ class ModelCompositor { predictionRoots = [{sample: inputTransform, p: 1.0}]; prefixTransform = inputTransform; } else { - predictionRoots = transformDistribution.map(function(alt) { + predictionRoots = transformDistribution.map((alt) => { let transform = alt.sample; // Filter out special keys unless they're expected. 
- if(this.isWhitespace(transform) && !allowSpace) { + if(TransformUtils.isWhitespace(transform) && !allowSpace) { return null; - } else if(this.isBackspace(transform) && !allowBksp) { + } else if(TransformUtils.isBackspace(transform) && !allowBksp) { return null; } return alt; - }, this); + }); } // Remove `null` entries. From b0184ec98bcf1106119afab36b6b7d09b9db70e1 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 11:08:26 +0700 Subject: [PATCH 03/19] fix(web): adds unit test targeting issue, handler for edge case --- .../headless/edit-distance/context-tracker.js | 26 ++++++++++++++++--- .../src/correction/context-tracker.ts | 2 +- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js index 6a4148b6695..5960a476584 100644 --- a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js +++ b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js @@ -45,7 +45,7 @@ describe('ContextTracker', function() { assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); }); - it("properly matches and aligns when a 'wordbreak' is added'", function() { + it("properly matches and aligns when a 'wordbreak' is added", function() { let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"]; let transform = { insert: ' ', @@ -56,7 +56,7 @@ describe('ContextTracker', function() { let rawTokens = ["an", null, "apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""]; let existingState = ContextTracker.modelContextState(existingContext); - let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform)); + let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform)); assert.isNotNull(state); assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); @@ -65,6 +65,26 @@ describe('ContextTracker', function() { assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions); }); + it("properly matches and aligns when an implied 'wordbreak' occurs \"'\"", function() { + let existingContext = ["'"]; + let transform = { + insert: 'a', + deleteLeft: 0 + } + let newContext = Array.from(existingContext); + newContext.push('a'); // The incoming transform should produce a new token WITH TEXT. 
+ let rawTokens = ["'", null, "a"]; + + let existingState = ContextTracker.modelContextState(existingContext); + let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform)); + assert.isNotNull(state); + assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); + + // The 'wordbreak' transform + assert.isEmpty(state.tokens[state.tokens.length - 2].transformDistributions); + assert.isNotEmpty(state.tokens[state.tokens.length - 1].transformDistributions); + }); + it("properly matches and aligns when lead token is removed AND a 'wordbreak' is added'", function() { let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"]; let transform = { @@ -77,7 +97,7 @@ describe('ContextTracker', function() { let rawTokens = ["apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""]; let existingState = ContextTracker.modelContextState(existingContext); - let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform)); + let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform)); assert.isNotNull(state); assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 1a94723546c..0f50657f0e9 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -413,7 +413,7 @@ namespace correction { let pushedToken = new TrackedContextToken(); pushedToken.raw = tokenizedTail; - if(isWhitespace) { + if(isWhitespace || !primaryInput) { state.pushWhitespaceToTail(transformDistribution); // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters // for the new word (token), so the input keystrokes do not correspond to the new text token. From 0a237f800fbbca7aa78483ae0f3371a71534b52f Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 11:31:57 +0700 Subject: [PATCH 04/19] chore(web): test name tweak --- .../unit_tests/headless/edit-distance/context-tracker.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js index 5960a476584..fa09ca20dec 100644 --- a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js +++ b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js @@ -65,7 +65,7 @@ describe('ContextTracker', function() { assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions); }); - it("properly matches and aligns when an implied 'wordbreak' occurs \"'\"", function() { + it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() { let existingContext = ["'"]; let transform = { insert: 'a', From d33a9f05bf999cbc476bb600f50eada8ee934395 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Mon, 5 Sep 2022 11:41:42 +0700 Subject: [PATCH 05/19] docs(web): fixes missed doc update --- common/web/lm-worker/src/correction/context-tracker.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 0f50657f0e9..bc187fa03f5 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -502,7 +502,6 @@ namespace correction { * * @param model * @param context - * @param mainTransform * @param transformDistribution */ analyzeState(model: LexicalModel, From 57ce30e0685fb6e6c8c3d06207045d4960e2131d Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 12:17:17 +0700 Subject: [PATCH 06/19] fix(web): post-suggestion-apply error --- common/web/lm-worker/src/correction/context-tracker.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index bc187fa03f5..99842b8299a 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -414,7 +414,7 @@ namespace correction { pushedToken.raw = tokenizedTail; if(isWhitespace || !primaryInput) { - state.pushWhitespaceToTail(transformDistribution); + state.pushWhitespaceToTail(transformDistribution ?? []); // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters // for the new word (token), so the input keystrokes do not correspond to the new text token. pushedToken.transformDistributions = []; @@ -422,7 +422,7 @@ namespace correction { state.pushWhitespaceToTail(); // Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing // any metadata used to construct the new context state token. - pushedToken.transformDistributions = [transformDistribution]; + pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : []; } state.pushTail(pushedToken); From b47596a803cc9c65c329d85a33c8c1d3736f74e0 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 12:35:11 +0700 Subject: [PATCH 07/19] fix(common/models): context token .isNew maintenance --- common/web/lm-worker/src/correction/context-tracker.ts | 5 ++++- common/web/lm-worker/src/model-compositor.ts | 9 ++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 99842b8299a..08ca11b1368 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -27,13 +27,14 @@ namespace correction { export class TrackedContextToken { raw: string; replacementText: string; + newFlag: boolean = false; transformDistributions: Distribution[] = []; replacements: TrackedContextSuggestion[]; activeReplacementId: number = -1; get isNew(): boolean { - return this.transformDistributions.length == 0; + return this.newFlag; } get currentText(): string { @@ -126,6 +127,7 @@ namespace correction { } else { this.searchSpace = []; } + token.newFlag = true; this.tokens.push(token); let state = this; @@ -189,6 +191,7 @@ namespace correction { } // Replace old token's raw-text with new token's raw-text. 
editedToken.raw = tokenText; + editedToken.newFlag = false; } toRawTokenization() { diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 657a7b1ec10..6c40d36e981 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -136,12 +136,15 @@ class ModelCompositor { // The 'eventual' logic will be significantly more complex, though still manageable. let searchSpace = contextState.searchSpace[0]; - let newEmptyToken = false; + let newToken = false; // Detect if we're starting a new context state. let contextTokens = contextState.tokens; if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) { + // Always note if we have a new token (so that we don't try to delete existing context) + newToken = true; + // If the new token is due to whitespace, or if we had a context-reset trigger this (thus, no input...) + // (Lingering question: do we need the .isEmpty check here? Track `prefixTransform` and find out.) if(TransformUtils.isEmpty(inputTransform) || TransformUtils.isWhitespace(inputTransform)) { - newEmptyToken = true; prefixTransform = inputTransform; context = postContext; // Ensure the whitespace token is preapplied! } @@ -170,7 +173,7 @@ class ModelCompositor { let deleteLeft = 0; // remove actual token string. If new token, there should be nothing to delete. - if(!newEmptyToken) { + if(!newToken) { // If this is triggered from a backspace, make sure to use its results // and also include its left-deletions! It's the one post-input context case. if(allowBksp) { From 26c5150977cf60de364b6d2ef8aa893668d00aeb Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 14:02:55 +0700 Subject: [PATCH 08/19] fix(web): context-tracker newFlag management for new contexts --- .../lm-worker/src/correction/context-tracker.ts | 17 +++++++++++++++-- common/web/lm-worker/src/model-compositor.ts | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 08ca11b1368..fba88261688 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -461,7 +461,9 @@ namespace correction { return state; } - static modelContextState(tokenizedContext: USVString[], lexicalModel: LexicalModel): TrackedContextState { + static modelContextState(tokenizedContext: USVString[], + transformDistribution: Distribution, + lexicalModel: LexicalModel): TrackedContextState { let baseTokens = tokenizedContext.map(function(entry) { let token = new TrackedContextToken(); token.raw = entry; @@ -495,6 +497,17 @@ namespace correction { state.pushTail(token); } + for(let i = 0; i < state.tokens.length - 1; i++) { + state.tokens[i].newFlag = false; + } + + const finalToken = state.tokens[state.tokens.length - 1]; + const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null; + + if(!baseTransform || baseTransform.sample.insert != finalToken.raw) { + finalToken.newFlag = false; + } + return state; } @@ -537,7 +550,7 @@ namespace correction { // // Assumption: as a caret needs to move to context before any actual transform distributions occur, // this state is only reached on caret moves; thus, transformDistribution is actually just a single null transform. 
- let state = ContextTracker.modelContextState(tokenizedContext.left, model); + let state = ContextTracker.modelContextState(tokenizedContext.left, transformDistribution, model); state.taggedContext = context; this.enqueue(state); return state; diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 6c40d36e981..8709ab0c6bb 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -638,7 +638,7 @@ class ModelCompositor { // than before. if(this.contextTracker) { let tokenizedContext = models.tokenize(this.lexicalModel.wordbreaker || wordBreakers.default, context); - let contextState = correction.ContextTracker.modelContextState(tokenizedContext.left, this.lexicalModel); + let contextState = correction.ContextTracker.modelContextState(tokenizedContext.left, null, this.lexicalModel); this.contextTracker.enqueue(contextState); } } From 9634c7635340a93babd6bee323941d97d797a54d Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Tue, 6 Sep 2022 08:57:56 +0700 Subject: [PATCH 09/19] chore(common/models): suggested tweak from review --- common/web/lm-worker/src/transformUtils.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index 8512d4e9a20..3bc66f33ce3 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -16,10 +16,10 @@ class TransformUtils { } static isBackspace(transform: Transform): boolean { - return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight; + return transform.insert == "" && transform.deleteLeft > 0 && transform.deleteRight == 0; } static isEmpty(transform: Transform): boolean { - return transform.insert == '' && transform.deleteLeft == 0; + return transform.insert == '' && transform.deleteLeft == 0 && transform.deleteRight == 0; } } \ No newline at end of file From 9be8839685ae8d65fbae37ff5eb9ab836f863804 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Tue, 6 Sep 2022 10:47:51 +0700 Subject: [PATCH 10/19] fix(common/models): undefined != 0 --- common/web/lm-worker/src/transformUtils.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index 3bc66f33ce3..b91951722f1 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -16,10 +16,10 @@ class TransformUtils { } static isBackspace(transform: Transform): boolean { - return transform.insert == "" && transform.deleteLeft > 0 && transform.deleteRight == 0; + return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight; } static isEmpty(transform: Transform): boolean { - return transform.insert == '' && transform.deleteLeft == 0 && transform.deleteRight == 0; + return transform.insert == '' && transform.deleteLeft == 0 && !transform.deleteRight; } } \ No newline at end of file From fb0cc5086cd5f0abbf6d0cac4d166cd691fbca88 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Wed, 7 Sep 2022 14:57:53 +0700 Subject: [PATCH 11/19] fix(common/models): backspacing shouldn't make 'new' tokens --- .../web/lm-worker/src/correction/context-tracker.ts | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index fba88261688..b670f1dd4ad 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -127,7 +127,6 @@ namespace correction { } else { this.searchSpace = []; } - token.newFlag = true; this.tokens.push(token); let state = this; @@ -428,6 +427,7 @@ namespace correction { pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : []; } + pushedToken.newFlag = true; state.pushTail(pushedToken); } else { // We're editing the final context token. // TODO: Assumption: we didn't 'miss' any inputs somehow. @@ -448,6 +448,7 @@ namespace correction { let token = new TrackedContextToken(); token.raw = tokenizedContext[0]; token.transformDistributions = [transformDistribution]; + token.newFlag = true; state.pushTail(token); } else { // Edit the lone context token. // Consider backspace entry for this case? @@ -497,15 +498,11 @@ namespace correction { state.pushTail(token); } - for(let i = 0; i < state.tokens.length - 1; i++) { - state.tokens[i].newFlag = false; - } - const finalToken = state.tokens[state.tokens.length - 1]; const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null; - if(!baseTransform || baseTransform.sample.insert != finalToken.raw) { - finalToken.newFlag = false; + if(baseTransform && baseTransform.sample.insert == finalToken.raw) { + finalToken.newFlag = true; } return state; From 52cb9aa19d944f53f15838dace9c156d81984ce7 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 8 Sep 2022 10:48:12 +0700 Subject: [PATCH 12/19] change(common/models): drops .isNew, replaces with tokenized context contrast logic --- .../src/correction/context-tracker.ts | 15 --- common/web/lm-worker/src/model-compositor.ts | 106 ++++++++++++------ 2 files changed, 72 insertions(+), 49 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index b670f1dd4ad..f6104ef921f 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -27,16 +27,11 @@ namespace correction { export class TrackedContextToken { raw: string; replacementText: string; - newFlag: boolean = false; transformDistributions: Distribution[] = []; replacements: TrackedContextSuggestion[]; activeReplacementId: number = -1; - get isNew(): boolean { - return this.newFlag; - } - get currentText(): string { if(this.replacementText === undefined || this.replacementText === null) { return this.raw; @@ -190,7 +185,6 @@ namespace correction { } // Replace old token's raw-text with new token's raw-text. editedToken.raw = tokenText; - editedToken.newFlag = false; } toRawTokenization() { @@ -427,7 +421,6 @@ namespace correction { pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : []; } - pushedToken.newFlag = true; state.pushTail(pushedToken); } else { // We're editing the final context token. // TODO: Assumption: we didn't 'miss' any inputs somehow. 
@@ -448,7 +441,6 @@ namespace correction { let token = new TrackedContextToken(); token.raw = tokenizedContext[0]; token.transformDistributions = [transformDistribution]; - token.newFlag = true; state.pushTail(token); } else { // Edit the lone context token. // Consider backspace entry for this case? @@ -498,13 +490,6 @@ namespace correction { state.pushTail(token); } - const finalToken = state.tokens[state.tokens.length - 1]; - const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null; - - if(baseTransform && baseTransform.sample.insert == finalToken.raw) { - finalToken.newFlag = true; - } - return state; } diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 8709ab0c6bb..9a7da82a803 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -85,7 +85,7 @@ class ModelCompositor { // Used to restore whitespaces if operations would remove them. let prefixTransform: Transform; - let contextState: correction.TrackedContextState = null; + let postContextState: correction.TrackedContextState = null; // Section 1: determining 'prediction roots'. if(!this.contextTracker) { @@ -120,12 +120,15 @@ class ModelCompositor { // Running in bulk over all suggestions, duplicate entries may be possible. rawPredictions = this.predictFromCorrections(predictionRoots, context); } else { - contextState = this.contextTracker.analyzeState(this.lexicalModel, - postContext, - !TransformUtils.isEmpty(inputTransform) ? - transformDistribution: - null - ); + // Token replacement benefits greatly from knowledge of the prior context state. + let contextState = this.contextTracker.analyzeState(this.lexicalModel, context, null); + // Corrections and predictions are based upon the post-context state, though. + postContextState = this.contextTracker.analyzeState(this.lexicalModel, + postContext, + !TransformUtils.isEmpty(inputTransform) ? + transformDistribution: + null + ); // TODO: Should we filter backspaces & whitespaces out of the transform distribution? // Ideally, the answer (in the future) will be no, but leaving it in right now may pose an issue. @@ -134,20 +137,68 @@ class ModelCompositor { // let's just note that right now, there will only ever be one. // // The 'eventual' logic will be significantly more complex, though still manageable. - let searchSpace = contextState.searchSpace[0]; - - let newToken = false; - // Detect if we're starting a new context state. - let contextTokens = contextState.tokens; - if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) { - // Always note if we have a new token (so that we don't try to delete existing context) - newToken = true; - // If the new token is due to whitespace, or if we had a context-reset trigger this (thus, no input...) - // (Lingering question: do we need the .isEmpty check here? Track `prefixTransform` and find out.) - if(TransformUtils.isEmpty(inputTransform) || TransformUtils.isWhitespace(inputTransform)) { + let searchSpace = postContextState.searchSpace[0]; + + // No matter the prediction, once we know the root of the prediction, we'll always 'replace' the + // same amount of text. We can handle this before the big 'prediction root' loop. + let deleteLeft = 0; + + // The amount of text to 'replace' depends upon whatever sort of context change occurs + // from the received input. 
+ let postContextLength = postContextState.tokens.length; + let contextLengthDelta = postContextState.tokens.length - contextState.tokens.length; + // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist. + if(postContextLength == 0 || contextLengthDelta > 0) { + // As the word/token being corrected/predicted didn't originally exist, there's no + // part of it to 'replace'. + deleteLeft = 0; + + // If the new token is due to whitespace or due to a different input type that would + // likely imply a tokenization boundary... + if(TransformUtils.isWhitespace(inputTransform)) { + /* TODO: consider/implement: the second half of the comment above. + * For example: on input of a `'`, predict new words instead of replacing the `'`. + * (since after a letter, the `'` will be ignored, anyway) + * + * Idea: if the model's most likely prediction (with no root) would make a new + * token if appended to the current token, that's probably a good case. + * Keeps the check simple & quick. + * + * Might need a mixed mode, though: ';' is close enough that `l` is a reasonable + * fat-finger guess. So yeah, we're not addressing this idea right now. + * - so... consider multiple context behavior angles when building prediction roots? + * + * May need something similar to help handle contractions during their construction, + * but that'd be within `ContextTracker`. + * can' => [`can`, `'`] + * can't => [`can't`] (WB6, 7 of https://unicode.org/reports/tr29/#Word_Boundary_Rules) + * + * (Would also helps WB7b+c for Hebrew text) + */ + + // Infer 'new word' mode, even if we received new text when reaching + // this position. That new text didn't exist before, so still - nothing + // to 'replace'. prefixTransform = inputTransform; - context = postContext; // Ensure the whitespace token is preapplied! + context = postContext; // As far as predictions are concerned, the post-context state + // should not be replaced. Predictions are to be rooted on + // text "up for correction" - so we want a null root for this + // branch. + contextState = postContextState; } + // If the tokenized context length is shorter... sounds like a backspace (or similar). + } else if (contextLengthDelta < 0) { + /* Ooh, we've dropped context here. Almost certainly from a backspace. + * Even if we drop multiple tokens... well, we know exactly how many chars + * were actually deleted - `inputTransform.deleteLeft`. + * Since we replace a word being corrected/predicted, we take length of the remaining + * context's tail token in addition to however far was deleted to reach that state. + */ + deleteLeft = this.wordbreak(postContext).kmwLength() + inputTransform.deleteLeft; + } else { + // Suggestions are applied to the pre-input context, so get the token's original length. + // We're on the same token, so just delete its text for the replacement op. + deleteLeft = this.wordbreak(context).kmwLength(); } // TODO: whitespace, backspace filtering. Do it here. @@ -171,19 +222,6 @@ class ModelCompositor { finalInput = inputTransform; // A fallback measure. Greatly matters for empty contexts. } - let deleteLeft = 0; - // remove actual token string. If new token, there should be nothing to delete. - if(!newToken) { - // If this is triggered from a backspace, make sure to use its results - // and also include its left-deletions! It's the one post-input context case. 
- if(allowBksp) { - deleteLeft = this.wordbreak(postContext).kmwLength() + inputTransform.deleteLeft; - } else { - // Normal case - use the pre-input context. - deleteLeft = this.wordbreak(context).kmwLength(); - } - } - // Replace the existing context with the correction. let correctionTransform: Transform = { insert: correction, // insert correction string @@ -390,8 +428,8 @@ class ModelCompositor { // Store the suggestions on the final token of the current context state (if it exists). // Or, once phrase-level suggestions are possible, on whichever token serves as each prediction's root. - if(contextState) { - contextState.tail.replacements = suggestions.map(function(suggestion) { + if(postContextState) { + postContextState.tail.replacements = suggestions.map(function(suggestion) { return { suggestion: suggestion, tokenWidth: 1 From 3aed0bf39556a2f7431b1c17bc2655efa7bc152c Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 8 Sep 2022 10:48:23 +0700 Subject: [PATCH 13/19] feat(common/models): also, unit tests --- .../headless/worker-model-compositor.js | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/common/predictive-text/unit_tests/headless/worker-model-compositor.js b/common/predictive-text/unit_tests/headless/worker-model-compositor.js index f7e9173cee1..dc8a36e9dc0 100644 --- a/common/predictive-text/unit_tests/headless/worker-model-compositor.js +++ b/common/predictive-text/unit_tests/headless/worker-model-compositor.js @@ -66,12 +66,53 @@ describe('ModelCompositor', function() { // Suggestions always delete the full root of the suggestion. // // After a backspace, that means the text 'the' - 3 chars. - // Char 4 is for the original backspace, as suggstions are built + // Char 4 is for the original backspace, as suggestions are built // based on the context state BEFORE the triggering input - // here, a backspace. assert.equal(suggestion.transform.deleteLeft, 4); }); }); + + it('properly handles suggestions for the first letter after a ` `', function() { + let compositor = new ModelCompositor(plainModel); + let context = { + left: 'the', startOfBuffer: true, endOfBuffer: true, + }; + + let inputTransform = { + insert: ' ', + deleteLeft: 0 + }; + + let suggestions = compositor.predict(inputTransform, context); + suggestions.forEach(function(suggestion) { + // After a space, predictions are based on a new, zero-length root. + // With nothing to replace, .deleteLeft should be zero. + assert.equal(suggestion.transform.deleteLeft, 0); + }); + }); + + it('properly handles suggestions for the first letter after a `\'`', function() { + let compositor = new ModelCompositor(plainModel); + let context = { + left: "the '", startOfBuffer: true, endOfBuffer: true, + }; + + // This results in a new word boundary (between the `'` and the `a`). + // Basically, an implied (but nonexistent) ` `. + let inputTransform = { + insert: "a", + deleteLeft: 0 + }; + + let suggestions = compositor.predict(inputTransform, context); + suggestions.forEach(function(suggestion) { + // Suggestions always delete the full root of the suggestion. + // Which, here, didn't exist before the input. Nothing to + // replace => nothing for the suggestion to delete. + assert.equal(suggestion.transform.deleteLeft, 0); + }); + }); }); describe('applySuggestionCasing', function() { From 75e0b27bf379f80813855d6fb35f12bb26f64c2a Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Thu, 8 Sep 2022 10:54:01 +0700 Subject: [PATCH 14/19] change(web): pushWhitespaceToTail tweak --- common/web/lm-worker/src/correction/context-tracker.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index f6104ef921f..528edc2ce83 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -137,6 +137,8 @@ namespace correction { // Will be needed for phrase-level correction/prediction. if(transformDistribution) { whitespaceToken.transformDistributions = [transformDistribution]; + } else { + whitespaceToken.transformDistributions = []; } whitespaceToken.raw = null; From 6c50de9e9a220f28c49050dae44e6f3e01ef7a18 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 8 Sep 2022 10:56:32 +0700 Subject: [PATCH 15/19] change(web): conciser version of last commit --- common/web/lm-worker/src/correction/context-tracker.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 528edc2ce83..e17e201d785 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -135,11 +135,7 @@ namespace correction { // Track the Transform that resulted in the whitespace 'token'. // Will be needed for phrase-level correction/prediction. - if(transformDistribution) { - whitespaceToken.transformDistributions = [transformDistribution]; - } else { - whitespaceToken.transformDistributions = []; - } + whitespaceToken.transformDistributions = transformDistribution ? [transformDistribution] : []; whitespaceToken.raw = null; this.tokens.push(whitespaceToken); From 415201bf15d7cdb429df44a8f86bf2984ba82399 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Mon, 12 Sep 2022 15:03:48 +0700 Subject: [PATCH 16/19] fix(common/models): isWhitespace, adds related unit tests --- .../unit_tests/headless/transform-utils.js | 58 +++++++++++++++++++ common/web/lm-worker/src/transformUtils.ts | 10 +--- 2 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 common/predictive-text/unit_tests/headless/transform-utils.js diff --git a/common/predictive-text/unit_tests/headless/transform-utils.js b/common/predictive-text/unit_tests/headless/transform-utils.js new file mode 100644 index 00000000000..b00a24a1230 --- /dev/null +++ b/common/predictive-text/unit_tests/headless/transform-utils.js @@ -0,0 +1,58 @@ +var assert = require('chai').assert; + +let TransformUtils = require('../../../web/lm-worker/build/intermediate.js').TransformUtils; + +describe('TransformUtils', function () { + describe('isWhitespace', function () { + it("should not match a string containing standard alphabetic characters", function () { + let testTransforms = [{ + insert: "a ", + deleteLeft: 0 + }, { + insert: " a", + deleteLeft: 0 + }, { + insert: "ab", + deleteLeft: 0 + }]; + + testTransforms.forEach((transform) => assert.isFalse(TransformUtils.isWhitespace(transform), `failed with: '${transform.insert}'`)); + }); + + it("should match a simple ' ' transform", function() { + transform = { + insert: " ", + deleteLeft: 0 + }; + + assert.isTrue(TransformUtils.isWhitespace(transform)); + }); + + it("should match a simple ' ' transform with delete-left", function() { + transform = { + insert: " ", + deleteLeft: 1 + }; + + assert.isTrue(TransformUtils.isWhitespace(transform)); + }); + + it("should match a transform consisting of multiple characters of only whitespace", function() { + transform = { + insert: " \n\r\u00a0\t\u2000 ", + deleteLeft: 0 + }; + + assert.isTrue(TransformUtils.isWhitespace(transform)); + }); + + it("stress tests", function() { + transform = { + insert: " \n\r\u00a0\ta\u2000 ", // the 'a' should cause failure. + deleteLeft: 0 + }; + + assert.isFalse(TransformUtils.isWhitespace(transform)); + }); + }); +}); diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index b91951722f1..8c82f16b3be 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -1,18 +1,14 @@ class TransformUtils { static isWhitespace(transform: Transform): boolean { // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. - let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i; + const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i; // Filter out null-inserts; their high probability can cause issues. - if(transform.insert == '') { // Can actually register as 'whitespace'. + if(transform.insert == '') { return false; } - let insert = transform.insert; - - insert = insert.replace(whitespaceRemover, ''); - - return insert == ''; + return transform.insert.match(whitespaceRemover) != null; } static isBackspace(transform: Transform): boolean { From 665b149b096735a361ae4dc79f6678198abd872c Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Mon, 12 Sep 2022 15:04:17 +0700 Subject: [PATCH 17/19] fix(common/models): needed export for unit tests --- common/web/lm-worker/src/index.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/web/lm-worker/src/index.ts b/common/web/lm-worker/src/index.ts index 7cc3bcbde1d..e3c16fa2c86 100644 --- a/common/web/lm-worker/src/index.ts +++ b/common/web/lm-worker/src/index.ts @@ -32,6 +32,7 @@ /// /// /// +/// /** * Encapsulates all the state required for the LMLayer's worker thread. @@ -407,6 +408,7 @@ if (typeof module !== 'undefined' && typeof module.exports !== 'undefined') { module.exports['wordBreakers'] = wordBreakers; /// XXX: export the ModelCompositor for testing. module.exports['ModelCompositor'] = ModelCompositor; + module.exports['TransformUtils'] = TransformUtils; } else if (typeof self !== 'undefined' && 'postMessage' in self && 'importScripts' in self) { // Automatically install if we're in a Web Worker. LMLayerWorker.install(self as any); // really, 'as typeof globalThis', but we're currently getting TS errors from use of that. From a390c2fb2ffce809ed4f8cd5289bff172a2b062b Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 14 Sep 2022 13:32:14 +0700 Subject: [PATCH 18/19] chore(common/models): Apply suggestions from code review Co-authored-by: Marc Durdin --- common/web/lm-worker/src/transformUtils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index 8c82f16b3be..b5aa598b653 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -1,6 +1,6 @@ class TransformUtils { static isWhitespace(transform: Transform): boolean { - // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. + // Matches a string that is entirely one or more characters with Unicode general property Z* or the following: CR, LF, and Tab. const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i; // Filter out null-inserts; their high probability can cause issues. From 917fce0750ae35026d1dd6c1c0f24eb7e35e4bcd Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 15 Sep 2022 08:20:36 +0700 Subject: [PATCH 19/19] chore(common/models): final requested tweak --- common/web/lm-worker/src/transformUtils.ts | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index b5aa598b653..cbb2c151fe7 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -2,12 +2,6 @@ class TransformUtils { static isWhitespace(transform: Transform): boolean { // Matches a string that is entirely one or more characters with Unicode general property Z* or the following: CR, LF, and Tab. const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i; - - // Filter out null-inserts; their high probability can cause issues. - if(transform.insert == '') { - return false; - } - return transform.insert.match(whitespaceRemover) != null; }