From ad8c30c6c07b7f5272c1b1e55976a9b146fbbf06 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 10:26:31 +0700 Subject: [PATCH 01/19] fix(web): pred-text context tracking when wordbreak not caused by whitespace --- .../src/correction/context-tracker.ts | 80 +++++++++++-------- common/web/lm-worker/src/model-compositor.ts | 36 ++------- common/web/lm-worker/src/transformUtils.ts | 25 ++++++ 3 files changed, 79 insertions(+), 62 deletions(-) create mode 100644 common/web/lm-worker/src/transformUtils.ts diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 52e89bedc87..1a94723546c 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -89,7 +89,7 @@ namespace correction { if(token.replacementText) { copy.replacementText = token.replacementText; } - + return copy; }); this.searchSpace = obj.searchSpace; @@ -139,8 +139,10 @@ namespace correction { // Track the Transform that resulted in the whitespace 'token'. // Will be needed for phrase-level correction/prediction. - whitespaceToken.transformDistributions = [transformDistribution]; - + if(transformDistribution) { + whitespaceToken.transformDistributions = [transformDistribution]; + } + whitespaceToken.raw = null; this.tokens.push(whitespaceToken); } @@ -149,19 +151,19 @@ namespace correction { * Used for 14.0's backspace workaround, which flattens all previous Distribution * entries because of limitations with direct use of backspace transforms. * @param tokenText - * @param transformId + * @param transformId */ replaceTailForBackspace(tokenText: USVString, transformId: number) { this.tokens.pop(); // It's a backspace transform; time for special handling! // - // For now, with 14.0, we simply compress all remaining Transforms for the token into - // multiple single-char transforms. Probabalistically modeling BKSP is quite complex, + // For now, with 14.0, we simply compress all remaining Transforms for the token into + // multiple single-char transforms. Probabalistically modeling BKSP is quite complex, // so we simplify by assuming everything remaining after a BKSP is 'true' and 'intended' text. // // Note that we cannot just use a single, monolithic transform at this point b/c - // of our current edit-distance optimization strategy; diagonalization is currently... + // of our current edit-distance optimization strategy; diagonalization is currently... // not very compatible with that. let backspacedTokenContext: Distribution[] = textToCharTransforms(tokenText, transformId).map(function(transform) { return [{sample: transform, p: 1.0}]; @@ -175,7 +177,7 @@ namespace correction { updateTail(transformDistribution: Distribution, tokenText?: USVString) { let editedToken = this.tail; - + // Preserve existing text if new text isn't specified. tokenText = tokenText || (tokenText === '' ? '' : editedToken.raw); @@ -191,7 +193,7 @@ namespace correction { toRawTokenization() { let sequence: USVString[] = []; - + for(let token of this.tokens) { // Hide any tokens representing wordbreaks. (Thinking ahead to phrase-level possibilities) if(token.currentText !== null) { @@ -281,7 +283,7 @@ namespace correction { /** * Returns items contained within the circular array, ordered from 'oldest' to 'newest' - * the same order in which the items will be dequeued. 
- * @param index + * @param index */ item(index: number) { if(index >= this.count) { @@ -294,7 +296,7 @@ namespace correction { } export class ContextTracker extends CircularArray { - static attemptMatchContext(tokenizedContext: USVString[], + static attemptMatchContext(tokenizedContext: USVString[], matchState: TrackedContextState, transformDistribution?: Distribution,): TrackedContextState { // Map the previous tokenized state to an edit-distance friendly version. @@ -335,7 +337,7 @@ namespace correction { } // Can happen for the first text input after backspace deletes a wordbreaking character, - // thus the new input continues a previous word while dropping the empty word after + // thus the new input continues a previous word while dropping the empty word after // that prior wordbreaking character. // // We can't handle it reliably from this match state, but a previous entry (without the empty token) @@ -353,7 +355,7 @@ namespace correction { // If we've made it here... success! We have a context match! let state: TrackedContextState; - + if(pushedTail) { // On suggestion acceptance, we should update the previous final token. // We do it first so that the acceptance is replicated in the new TrackedContextState @@ -376,7 +378,9 @@ namespace correction { if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) { primaryInput = null; } - const isBackspace = primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft > 0 && !primaryInput.deleteRight; + + const isWhitespace = primaryInput && TransformUtils.isWhitespace(primaryInput); + const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput); const finalToken = tokenizedContext[tokenizedContext.length-1]; /* Assumption: This is an adequate check for its two sub-branches. @@ -388,7 +392,7 @@ namespace correction { * - Assumption: one keystroke may only cause a single token to be appended to the context * - That is, no "reasonable" keystroke would emit a Transform adding two separate word tokens * - For languages using whitespace to word-break, said keystroke would have to include said whitespace to break the assumption. - */ + */ // If there is/was more than one context token available... if(editPath.length > 1) { @@ -399,17 +403,29 @@ namespace correction { // We're adding an additional context token. if(pushedTail) { - // ASSUMPTION: any transform that triggers this case is a pure-whitespace Transform, as we - // need a word-break before beginning a new word's context. - // Worth note: when invalid, the lm-layer already has problems in other aspects too. - state.pushWhitespaceToTail(transformDistribution); - - let emptyToken = new TrackedContextToken(); - emptyToken.raw = ''; - // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters - // for the new word (token), so the input keystrokes do not correspond to the new text token. - emptyToken.transformDistributions = []; - state.pushTail(emptyToken); + const tokenizedTail = tokenizedContext[tokenizedContext.length - 1]; + /* + * Common-case: most transforms that trigger this case are from pure-whitespace Transforms. MOST. + * + * Less-common, but noteworthy: some wordbreaks may occur without whitespace. Example: + * `"o` => ['"', 'o']. Make sure to double-check against `tokenizedContext`! 
+ */ + let pushedToken = new TrackedContextToken(); + pushedToken.raw = tokenizedTail; + + if(isWhitespace) { + state.pushWhitespaceToTail(transformDistribution); + // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters + // for the new word (token), so the input keystrokes do not correspond to the new text token. + pushedToken.transformDistributions = []; + } else { + state.pushWhitespaceToTail(); + // Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing + // any metadata used to construct the new context state token. + pushedToken.transformDistributions = [transformDistribution]; + } + + state.pushTail(pushedToken); } else { // We're editing the final context token. // TODO: Assumption: we didn't 'miss' any inputs somehow. // As is, may be prone to fragility should the lm-layer's tracked context 'desync' from its host's. @@ -483,13 +499,13 @@ namespace correction { * Compares the current, post-input context against the most recently-seen contexts from previous prediction calls, returning * the most information-rich `TrackedContextState` possible. If a match is found, the state will be annotated with the * input information provided to previous prediction calls and persisted correction-search calculations for re-use. - * - * @param model - * @param context - * @param mainTransform - * @param transformDistribution + * + * @param model + * @param context + * @param mainTransform + * @param transformDistribution */ - analyzeState(model: LexicalModel, + analyzeState(model: LexicalModel, context: Context, transformDistribution?: Distribution): TrackedContextState { if(!model.traverseFromRoot) { diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 329460776d8..84687c867d4 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -16,30 +16,6 @@ class ModelCompositor { this.punctuation = ModelCompositor.determinePunctuationFromModel(lexicalModel); } - protected isWhitespace(transform: Transform): boolean { - // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. - let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i; - - // Filter out null-inserts; their high probability can cause issues. - if(transform.insert == '') { // Can actually register as 'whitespace'. - return false; - } - - let insert = transform.insert; - - insert = insert.replace(whitespaceRemover, ''); - - return insert == ''; - } - - protected isBackspace(transform: Transform): boolean { - return transform.insert == "" && transform.deleteLeft > 0; - } - - protected isEmpty(transform: Transform): boolean { - return transform.insert == '' && transform.deleteLeft == 0; - } - private predictFromCorrections(corrections: ProbabilityMass[], context: Context): Distribution { let returnedPredictions: Distribution = []; @@ -98,8 +74,8 @@ class ModelCompositor { })[0].sample; // Only allow new-word suggestions if space was the most likely keypress. 
- let allowSpace = this.isWhitespace(inputTransform); - let allowBksp = this.isBackspace(inputTransform); + let allowSpace = TransformUtils.isWhitespace(inputTransform); + let allowBksp = TransformUtils.isBackspace(inputTransform); let postContext = models.applyTransform(inputTransform, context); let keepOptionText = this.wordbreak(postContext); @@ -146,9 +122,9 @@ class ModelCompositor { } else { contextState = this.contextTracker.analyzeState(this.lexicalModel, postContext, - !this.isEmpty(inputTransform) ? - transformDistribution: - null + !TransformUtils.isEmpty(inputTransform) ? + transformDistribution: + null ); // TODO: Should we filter backspaces & whitespaces out of the transform distribution? @@ -164,7 +140,7 @@ class ModelCompositor { // Detect if we're starting a new context state. let contextTokens = contextState.tokens; if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) { - if(this.isEmpty(inputTransform) || this.isWhitespace(inputTransform)) { + if(TransformUtils.isEmpty(inputTransform) || TransformUtils.isWhitespace(inputTransform)) { newEmptyToken = true; prefixTransform = inputTransform; context = postContext; // Ensure the whitespace token is preapplied! diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts new file mode 100644 index 00000000000..8512d4e9a20 --- /dev/null +++ b/common/web/lm-worker/src/transformUtils.ts @@ -0,0 +1,25 @@ +class TransformUtils { + static isWhitespace(transform: Transform): boolean { + // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. + let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i; + + // Filter out null-inserts; their high probability can cause issues. + if(transform.insert == '') { // Can actually register as 'whitespace'. + return false; + } + + let insert = transform.insert; + + insert = insert.replace(whitespaceRemover, ''); + + return insert == ''; + } + + static isBackspace(transform: Transform): boolean { + return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight; + } + + static isEmpty(transform: Transform): boolean { + return transform.insert == '' && transform.deleteLeft == 0; + } +} \ No newline at end of file From be562b30225410afa77d65d1fbed58f6257f4784 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 10:37:50 +0700 Subject: [PATCH 02/19] fix(web): missed method references --- common/web/lm-worker/src/model-compositor.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 84687c867d4..657a7b1ec10 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -100,18 +100,18 @@ class ModelCompositor { predictionRoots = [{sample: inputTransform, p: 1.0}]; prefixTransform = inputTransform; } else { - predictionRoots = transformDistribution.map(function(alt) { + predictionRoots = transformDistribution.map((alt) => { let transform = alt.sample; // Filter out special keys unless they're expected. 
- if(this.isWhitespace(transform) && !allowSpace) { + if(TransformUtils.isWhitespace(transform) && !allowSpace) { return null; - } else if(this.isBackspace(transform) && !allowBksp) { + } else if(TransformUtils.isBackspace(transform) && !allowBksp) { return null; } return alt; - }, this); + }); } // Remove `null` entries. From b0184ec98bcf1106119afab36b6b7d09b9db70e1 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 11:08:26 +0700 Subject: [PATCH 03/19] fix(web): adds unit test targeting issue, handler for edge case --- .../headless/edit-distance/context-tracker.js | 26 ++++++++++++++++--- .../src/correction/context-tracker.ts | 2 +- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js index 6a4148b6695..5960a476584 100644 --- a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js +++ b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js @@ -45,7 +45,7 @@ describe('ContextTracker', function() { assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); }); - it("properly matches and aligns when a 'wordbreak' is added'", function() { + it("properly matches and aligns when a 'wordbreak' is added", function() { let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"]; let transform = { insert: ' ', @@ -56,7 +56,7 @@ describe('ContextTracker', function() { let rawTokens = ["an", null, "apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""]; let existingState = ContextTracker.modelContextState(existingContext); - let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform)); + let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform)); assert.isNotNull(state); assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); @@ -65,6 +65,26 @@ describe('ContextTracker', function() { assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions); }); + it("properly matches and aligns when an implied 'wordbreak' occurs \"'\"", function() { + let existingContext = ["'"]; + let transform = { + insert: 'a', + deleteLeft: 0 + } + let newContext = Array.from(existingContext); + newContext.push('a'); // The incoming transform should produce a new token WITH TEXT. 
+ let rawTokens = ["'", null, "a"]; + + let existingState = ContextTracker.modelContextState(existingContext); + let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform)); + assert.isNotNull(state); + assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); + + // The 'wordbreak' transform + assert.isEmpty(state.tokens[state.tokens.length - 2].transformDistributions); + assert.isNotEmpty(state.tokens[state.tokens.length - 1].transformDistributions); + }); + it("properly matches and aligns when lead token is removed AND a 'wordbreak' is added'", function() { let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"]; let transform = { @@ -77,7 +97,7 @@ describe('ContextTracker', function() { let rawTokens = ["apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""]; let existingState = ContextTracker.modelContextState(existingContext); - let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform)); + let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform)); assert.isNotNull(state); assert.deepEqual(state.tokens.map(token => token.raw), rawTokens); diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 1a94723546c..0f50657f0e9 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -413,7 +413,7 @@ namespace correction { let pushedToken = new TrackedContextToken(); pushedToken.raw = tokenizedTail; - if(isWhitespace) { + if(isWhitespace || !primaryInput) { state.pushWhitespaceToTail(transformDistribution); // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters // for the new word (token), so the input keystrokes do not correspond to the new text token. From 0a237f800fbbca7aa78483ae0f3371a71534b52f Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 11:31:57 +0700 Subject: [PATCH 04/19] chore(web): test name tweak --- .../unit_tests/headless/edit-distance/context-tracker.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js index 5960a476584..fa09ca20dec 100644 --- a/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js +++ b/common/predictive-text/unit_tests/headless/edit-distance/context-tracker.js @@ -65,7 +65,7 @@ describe('ContextTracker', function() { assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions); }); - it("properly matches and aligns when an implied 'wordbreak' occurs \"'\"", function() { + it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() { let existingContext = ["'"]; let transform = { insert: 'a', From d33a9f05bf999cbc476bb600f50eada8ee934395 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Mon, 5 Sep 2022 11:41:42 +0700 Subject: [PATCH 05/19] docs(web): fixes missed doc update --- common/web/lm-worker/src/correction/context-tracker.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 0f50657f0e9..bc187fa03f5 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -502,7 +502,6 @@ namespace correction { * * @param model * @param context - * @param mainTransform * @param transformDistribution */ analyzeState(model: LexicalModel, From 57ce30e0685fb6e6c8c3d06207045d4960e2131d Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 12:17:17 +0700 Subject: [PATCH 06/19] fix(web): post-suggestion-apply error --- common/web/lm-worker/src/correction/context-tracker.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index bc187fa03f5..99842b8299a 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -414,7 +414,7 @@ namespace correction { pushedToken.raw = tokenizedTail; if(isWhitespace || !primaryInput) { - state.pushWhitespaceToTail(transformDistribution); + state.pushWhitespaceToTail(transformDistribution ?? []); // Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters // for the new word (token), so the input keystrokes do not correspond to the new text token. pushedToken.transformDistributions = []; @@ -422,7 +422,7 @@ namespace correction { state.pushWhitespaceToTail(); // Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing // any metadata used to construct the new context state token. - pushedToken.transformDistributions = [transformDistribution]; + pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : []; } state.pushTail(pushedToken); From b47596a803cc9c65c329d85a33c8c1d3736f74e0 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 12:35:11 +0700 Subject: [PATCH 07/19] fix(common/models): context token .isNew maintenance --- common/web/lm-worker/src/correction/context-tracker.ts | 5 ++++- common/web/lm-worker/src/model-compositor.ts | 9 ++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 99842b8299a..08ca11b1368 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -27,13 +27,14 @@ namespace correction { export class TrackedContextToken { raw: string; replacementText: string; + newFlag: boolean = false; transformDistributions: Distribution[] = []; replacements: TrackedContextSuggestion[]; activeReplacementId: number = -1; get isNew(): boolean { - return this.transformDistributions.length == 0; + return this.newFlag; } get currentText(): string { @@ -126,6 +127,7 @@ namespace correction { } else { this.searchSpace = []; } + token.newFlag = true; this.tokens.push(token); let state = this; @@ -189,6 +191,7 @@ namespace correction { } // Replace old token's raw-text with new token's raw-text. 
editedToken.raw = tokenText; + editedToken.newFlag = false; } toRawTokenization() { diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 657a7b1ec10..6c40d36e981 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -136,12 +136,15 @@ class ModelCompositor { // The 'eventual' logic will be significantly more complex, though still manageable. let searchSpace = contextState.searchSpace[0]; - let newEmptyToken = false; + let newToken = false; // Detect if we're starting a new context state. let contextTokens = contextState.tokens; if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) { + // Always note if we have a new token (so that we don't try to delete existing context) + newToken = true; + // If the new token is due to whitespace, or if we had a context-reset trigger this (thus, no input...) + // (Lingering question: do we need the .isEmpty check here? Track `prefixTransform` and find out.) if(TransformUtils.isEmpty(inputTransform) || TransformUtils.isWhitespace(inputTransform)) { - newEmptyToken = true; prefixTransform = inputTransform; context = postContext; // Ensure the whitespace token is preapplied! } @@ -170,7 +173,7 @@ class ModelCompositor { let deleteLeft = 0; // remove actual token string. If new token, there should be nothing to delete. - if(!newEmptyToken) { + if(!newToken) { // If this is triggered from a backspace, make sure to use its results // and also include its left-deletions! It's the one post-input context case. if(allowBksp) { From 26c5150977cf60de364b6d2ef8aa893668d00aeb Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Mon, 5 Sep 2022 14:02:55 +0700 Subject: [PATCH 08/19] fix(web): context-tracker newFlag management for new contexts --- .../lm-worker/src/correction/context-tracker.ts | 17 +++++++++++++++-- common/web/lm-worker/src/model-compositor.ts | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 08ca11b1368..fba88261688 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -461,7 +461,9 @@ namespace correction { return state; } - static modelContextState(tokenizedContext: USVString[], lexicalModel: LexicalModel): TrackedContextState { + static modelContextState(tokenizedContext: USVString[], + transformDistribution: Distribution, + lexicalModel: LexicalModel): TrackedContextState { let baseTokens = tokenizedContext.map(function(entry) { let token = new TrackedContextToken(); token.raw = entry; @@ -495,6 +497,17 @@ namespace correction { state.pushTail(token); } + for(let i = 0; i < state.tokens.length - 1; i++) { + state.tokens[i].newFlag = false; + } + + const finalToken = state.tokens[state.tokens.length - 1]; + const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null; + + if(!baseTransform || baseTransform.sample.insert != finalToken.raw) { + finalToken.newFlag = false; + } + return state; } @@ -537,7 +550,7 @@ namespace correction { // // Assumption: as a caret needs to move to context before any actual transform distributions occur, // this state is only reached on caret moves; thus, transformDistribution is actually just a single null transform. 
- let state = ContextTracker.modelContextState(tokenizedContext.left, model); + let state = ContextTracker.modelContextState(tokenizedContext.left, transformDistribution, model); state.taggedContext = context; this.enqueue(state); return state; diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 6c40d36e981..8709ab0c6bb 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -638,7 +638,7 @@ class ModelCompositor { // than before. if(this.contextTracker) { let tokenizedContext = models.tokenize(this.lexicalModel.wordbreaker || wordBreakers.default, context); - let contextState = correction.ContextTracker.modelContextState(tokenizedContext.left, this.lexicalModel); + let contextState = correction.ContextTracker.modelContextState(tokenizedContext.left, null, this.lexicalModel); this.contextTracker.enqueue(contextState); } } From 9634c7635340a93babd6bee323941d97d797a54d Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Tue, 6 Sep 2022 08:57:56 +0700 Subject: [PATCH 09/19] chore(common/models): suggested tweak from review --- common/web/lm-worker/src/transformUtils.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index 8512d4e9a20..3bc66f33ce3 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -16,10 +16,10 @@ class TransformUtils { } static isBackspace(transform: Transform): boolean { - return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight; + return transform.insert == "" && transform.deleteLeft > 0 && transform.deleteRight == 0; } static isEmpty(transform: Transform): boolean { - return transform.insert == '' && transform.deleteLeft == 0; + return transform.insert == '' && transform.deleteLeft == 0 && transform.deleteRight == 0; } } \ No newline at end of file From 9be8839685ae8d65fbae37ff5eb9ab836f863804 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Tue, 6 Sep 2022 10:47:51 +0700 Subject: [PATCH 10/19] fix(common/models): undefined != 0 --- common/web/lm-worker/src/transformUtils.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index 3bc66f33ce3..b91951722f1 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -16,10 +16,10 @@ class TransformUtils { } static isBackspace(transform: Transform): boolean { - return transform.insert == "" && transform.deleteLeft > 0 && transform.deleteRight == 0; + return transform.insert == "" && transform.deleteLeft > 0 && !transform.deleteRight; } static isEmpty(transform: Transform): boolean { - return transform.insert == '' && transform.deleteLeft == 0 && transform.deleteRight == 0; + return transform.insert == '' && transform.deleteLeft == 0 && !transform.deleteRight; } } \ No newline at end of file From fb0cc5086cd5f0abbf6d0cac4d166cd691fbca88 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Wed, 7 Sep 2022 14:57:53 +0700 Subject: [PATCH 11/19] fix(common/models): backspacing shouldn't make 'new' tokens --- .../web/lm-worker/src/correction/context-tracker.ts | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index fba88261688..b670f1dd4ad 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -127,7 +127,6 @@ namespace correction { } else { this.searchSpace = []; } - token.newFlag = true; this.tokens.push(token); let state = this; @@ -428,6 +427,7 @@ namespace correction { pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : []; } + pushedToken.newFlag = true; state.pushTail(pushedToken); } else { // We're editing the final context token. // TODO: Assumption: we didn't 'miss' any inputs somehow. @@ -448,6 +448,7 @@ namespace correction { let token = new TrackedContextToken(); token.raw = tokenizedContext[0]; token.transformDistributions = [transformDistribution]; + token.newFlag = true; state.pushTail(token); } else { // Edit the lone context token. // Consider backspace entry for this case? @@ -497,15 +498,11 @@ namespace correction { state.pushTail(token); } - for(let i = 0; i < state.tokens.length - 1; i++) { - state.tokens[i].newFlag = false; - } - const finalToken = state.tokens[state.tokens.length - 1]; const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null; - if(!baseTransform || baseTransform.sample.insert != finalToken.raw) { - finalToken.newFlag = false; + if(baseTransform && baseTransform.sample.insert == finalToken.raw) { + finalToken.newFlag = true; } return state; From 52cb9aa19d944f53f15838dace9c156d81984ce7 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 8 Sep 2022 10:48:12 +0700 Subject: [PATCH 12/19] change(common/models): drops .isNew, replaces with tokenized context contrast logic --- .../src/correction/context-tracker.ts | 15 --- common/web/lm-worker/src/model-compositor.ts | 106 ++++++++++++------ 2 files changed, 72 insertions(+), 49 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index b670f1dd4ad..f6104ef921f 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -27,16 +27,11 @@ namespace correction { export class TrackedContextToken { raw: string; replacementText: string; - newFlag: boolean = false; transformDistributions: Distribution[] = []; replacements: TrackedContextSuggestion[]; activeReplacementId: number = -1; - get isNew(): boolean { - return this.newFlag; - } - get currentText(): string { if(this.replacementText === undefined || this.replacementText === null) { return this.raw; @@ -190,7 +185,6 @@ namespace correction { } // Replace old token's raw-text with new token's raw-text. editedToken.raw = tokenText; - editedToken.newFlag = false; } toRawTokenization() { @@ -427,7 +421,6 @@ namespace correction { pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : []; } - pushedToken.newFlag = true; state.pushTail(pushedToken); } else { // We're editing the final context token. // TODO: Assumption: we didn't 'miss' any inputs somehow. 
@@ -448,7 +441,6 @@ namespace correction { let token = new TrackedContextToken(); token.raw = tokenizedContext[0]; token.transformDistributions = [transformDistribution]; - token.newFlag = true; state.pushTail(token); } else { // Edit the lone context token. // Consider backspace entry for this case? @@ -498,13 +490,6 @@ namespace correction { state.pushTail(token); } - const finalToken = state.tokens[state.tokens.length - 1]; - const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null; - - if(baseTransform && baseTransform.sample.insert == finalToken.raw) { - finalToken.newFlag = true; - } - return state; } diff --git a/common/web/lm-worker/src/model-compositor.ts b/common/web/lm-worker/src/model-compositor.ts index 8709ab0c6bb..9a7da82a803 100644 --- a/common/web/lm-worker/src/model-compositor.ts +++ b/common/web/lm-worker/src/model-compositor.ts @@ -85,7 +85,7 @@ class ModelCompositor { // Used to restore whitespaces if operations would remove them. let prefixTransform: Transform; - let contextState: correction.TrackedContextState = null; + let postContextState: correction.TrackedContextState = null; // Section 1: determining 'prediction roots'. if(!this.contextTracker) { @@ -120,12 +120,15 @@ class ModelCompositor { // Running in bulk over all suggestions, duplicate entries may be possible. rawPredictions = this.predictFromCorrections(predictionRoots, context); } else { - contextState = this.contextTracker.analyzeState(this.lexicalModel, - postContext, - !TransformUtils.isEmpty(inputTransform) ? - transformDistribution: - null - ); + // Token replacement benefits greatly from knowledge of the prior context state. + let contextState = this.contextTracker.analyzeState(this.lexicalModel, context, null); + // Corrections and predictions are based upon the post-context state, though. + postContextState = this.contextTracker.analyzeState(this.lexicalModel, + postContext, + !TransformUtils.isEmpty(inputTransform) ? + transformDistribution: + null + ); // TODO: Should we filter backspaces & whitespaces out of the transform distribution? // Ideally, the answer (in the future) will be no, but leaving it in right now may pose an issue. @@ -134,20 +137,68 @@ class ModelCompositor { // let's just note that right now, there will only ever be one. // // The 'eventual' logic will be significantly more complex, though still manageable. - let searchSpace = contextState.searchSpace[0]; - - let newToken = false; - // Detect if we're starting a new context state. - let contextTokens = contextState.tokens; - if(contextTokens.length == 0 || contextTokens[contextTokens.length - 1].isNew) { - // Always note if we have a new token (so that we don't try to delete existing context) - newToken = true; - // If the new token is due to whitespace, or if we had a context-reset trigger this (thus, no input...) - // (Lingering question: do we need the .isEmpty check here? Track `prefixTransform` and find out.) - if(TransformUtils.isEmpty(inputTransform) || TransformUtils.isWhitespace(inputTransform)) { + let searchSpace = postContextState.searchSpace[0]; + + // No matter the prediction, once we know the root of the prediction, we'll always 'replace' the + // same amount of text. We can handle this before the big 'prediction root' loop. + let deleteLeft = 0; + + // The amount of text to 'replace' depends upon whatever sort of context change occurs + // from the received input. 
+ let postContextLength = postContextState.tokens.length; + let contextLengthDelta = postContextState.tokens.length - contextState.tokens.length; + // If the context now has more tokens, the token we'll be 'predicting' didn't originally exist. + if(postContextLength == 0 || contextLengthDelta > 0) { + // As the word/token being corrected/predicted didn't originally exist, there's no + // part of it to 'replace'. + deleteLeft = 0; + + // If the new token is due to whitespace or due to a different input type that would + // likely imply a tokenization boundary... + if(TransformUtils.isWhitespace(inputTransform)) { + /* TODO: consider/implement: the second half of the comment above. + * For example: on input of a `'`, predict new words instead of replacing the `'`. + * (since after a letter, the `'` will be ignored, anyway) + * + * Idea: if the model's most likely prediction (with no root) would make a new + * token if appended to the current token, that's probably a good case. + * Keeps the check simple & quick. + * + * Might need a mixed mode, though: ';' is close enough that `l` is a reasonable + * fat-finger guess. So yeah, we're not addressing this idea right now. + * - so... consider multiple context behavior angles when building prediction roots? + * + * May need something similar to help handle contractions during their construction, + * but that'd be within `ContextTracker`. + * can' => [`can`, `'`] + * can't => [`can't`] (WB6, 7 of https://unicode.org/reports/tr29/#Word_Boundary_Rules) + * + * (Would also helps WB7b+c for Hebrew text) + */ + + // Infer 'new word' mode, even if we received new text when reaching + // this position. That new text didn't exist before, so still - nothing + // to 'replace'. prefixTransform = inputTransform; - context = postContext; // Ensure the whitespace token is preapplied! + context = postContext; // As far as predictions are concerned, the post-context state + // should not be replaced. Predictions are to be rooted on + // text "up for correction" - so we want a null root for this + // branch. + contextState = postContextState; } + // If the tokenized context length is shorter... sounds like a backspace (or similar). + } else if (contextLengthDelta < 0) { + /* Ooh, we've dropped context here. Almost certainly from a backspace. + * Even if we drop multiple tokens... well, we know exactly how many chars + * were actually deleted - `inputTransform.deleteLeft`. + * Since we replace a word being corrected/predicted, we take length of the remaining + * context's tail token in addition to however far was deleted to reach that state. + */ + deleteLeft = this.wordbreak(postContext).kmwLength() + inputTransform.deleteLeft; + } else { + // Suggestions are applied to the pre-input context, so get the token's original length. + // We're on the same token, so just delete its text for the replacement op. + deleteLeft = this.wordbreak(context).kmwLength(); } // TODO: whitespace, backspace filtering. Do it here. @@ -171,19 +222,6 @@ class ModelCompositor { finalInput = inputTransform; // A fallback measure. Greatly matters for empty contexts. } - let deleteLeft = 0; - // remove actual token string. If new token, there should be nothing to delete. - if(!newToken) { - // If this is triggered from a backspace, make sure to use its results - // and also include its left-deletions! It's the one post-input context case. 
- if(allowBksp) { - deleteLeft = this.wordbreak(postContext).kmwLength() + inputTransform.deleteLeft; - } else { - // Normal case - use the pre-input context. - deleteLeft = this.wordbreak(context).kmwLength(); - } - } - // Replace the existing context with the correction. let correctionTransform: Transform = { insert: correction, // insert correction string @@ -390,8 +428,8 @@ class ModelCompositor { // Store the suggestions on the final token of the current context state (if it exists). // Or, once phrase-level suggestions are possible, on whichever token serves as each prediction's root. - if(contextState) { - contextState.tail.replacements = suggestions.map(function(suggestion) { + if(postContextState) { + postContextState.tail.replacements = suggestions.map(function(suggestion) { return { suggestion: suggestion, tokenWidth: 1 From 3aed0bf39556a2f7431b1c17bc2655efa7bc152c Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 8 Sep 2022 10:48:23 +0700 Subject: [PATCH 13/19] feat(common/models): also, unit tests --- .../headless/worker-model-compositor.js | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/common/predictive-text/unit_tests/headless/worker-model-compositor.js b/common/predictive-text/unit_tests/headless/worker-model-compositor.js index f7e9173cee1..dc8a36e9dc0 100644 --- a/common/predictive-text/unit_tests/headless/worker-model-compositor.js +++ b/common/predictive-text/unit_tests/headless/worker-model-compositor.js @@ -66,12 +66,53 @@ describe('ModelCompositor', function() { // Suggestions always delete the full root of the suggestion. // // After a backspace, that means the text 'the' - 3 chars. - // Char 4 is for the original backspace, as suggstions are built + // Char 4 is for the original backspace, as suggestions are built // based on the context state BEFORE the triggering input - // here, a backspace. assert.equal(suggestion.transform.deleteLeft, 4); }); }); + + it('properly handles suggestions for the first letter after a ` `', function() { + let compositor = new ModelCompositor(plainModel); + let context = { + left: 'the', startOfBuffer: true, endOfBuffer: true, + }; + + let inputTransform = { + insert: ' ', + deleteLeft: 0 + }; + + let suggestions = compositor.predict(inputTransform, context); + suggestions.forEach(function(suggestion) { + // After a space, predictions are based on a new, zero-length root. + // With nothing to replace, .deleteLeft should be zero. + assert.equal(suggestion.transform.deleteLeft, 0); + }); + }); + + it('properly handles suggestions for the first letter after a `\'`', function() { + let compositor = new ModelCompositor(plainModel); + let context = { + left: "the '", startOfBuffer: true, endOfBuffer: true, + }; + + // This results in a new word boundary (between the `'` and the `a`). + // Basically, an implied (but nonexistent) ` `. + let inputTransform = { + insert: "a", + deleteLeft: 0 + }; + + let suggestions = compositor.predict(inputTransform, context); + suggestions.forEach(function(suggestion) { + // Suggestions always delete the full root of the suggestion. + // Which, here, didn't exist before the input. Nothing to + // replace => nothing for the suggestion to delete. + assert.equal(suggestion.transform.deleteLeft, 0); + }); + }); }); describe('applySuggestionCasing', function() { From 75e0b27bf379f80813855d6fb35f12bb26f64c2a Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Thu, 8 Sep 2022 10:54:01 +0700 Subject: [PATCH 14/19] change(web): pushWhitespaceToTail tweak --- common/web/lm-worker/src/correction/context-tracker.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index f6104ef921f..528edc2ce83 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -137,6 +137,8 @@ namespace correction { // Will be needed for phrase-level correction/prediction. if(transformDistribution) { whitespaceToken.transformDistributions = [transformDistribution]; + } else { + whitespaceToken.transformDistributions = []; } whitespaceToken.raw = null; From 6c50de9e9a220f28c49050dae44e6f3e01ef7a18 Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 8 Sep 2022 10:56:32 +0700 Subject: [PATCH 15/19] change(web): conciser version of last commit --- common/web/lm-worker/src/correction/context-tracker.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/common/web/lm-worker/src/correction/context-tracker.ts b/common/web/lm-worker/src/correction/context-tracker.ts index 528edc2ce83..e17e201d785 100644 --- a/common/web/lm-worker/src/correction/context-tracker.ts +++ b/common/web/lm-worker/src/correction/context-tracker.ts @@ -135,11 +135,7 @@ namespace correction { // Track the Transform that resulted in the whitespace 'token'. // Will be needed for phrase-level correction/prediction. - if(transformDistribution) { - whitespaceToken.transformDistributions = [transformDistribution]; - } else { - whitespaceToken.transformDistributions = []; - } + whitespaceToken.transformDistributions = transformDistribution ? [transformDistribution] : []; whitespaceToken.raw = null; this.tokens.push(whitespaceToken); From 415201bf15d7cdb429df44a8f86bf2984ba82399 Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Mon, 12 Sep 2022 15:03:48 +0700 Subject: [PATCH 16/19] fix(common/models): isWhitespace, adds related unit tests --- .../unit_tests/headless/transform-utils.js | 58 +++++++++++++++++++ common/web/lm-worker/src/transformUtils.ts | 10 +--- 2 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 common/predictive-text/unit_tests/headless/transform-utils.js diff --git a/common/predictive-text/unit_tests/headless/transform-utils.js b/common/predictive-text/unit_tests/headless/transform-utils.js new file mode 100644 index 00000000000..b00a24a1230 --- /dev/null +++ b/common/predictive-text/unit_tests/headless/transform-utils.js @@ -0,0 +1,58 @@ +var assert = require('chai').assert; + +let TransformUtils = require('../../../web/lm-worker/build/intermediate.js').TransformUtils; + +describe('TransformUtils', function () { + describe('isWhitespace', function () { + it("should not match a string containing standard alphabetic characters", function () { + let testTransforms = [{ + insert: "a ", + deleteLeft: 0 + }, { + insert: " a", + deleteLeft: 0 + }, { + insert: "ab", + deleteLeft: 0 + }]; + + testTransforms.forEach((transform) => assert.isFalse(TransformUtils.isWhitespace(transform), `failed with: '${transform.insert}'`)); + }); + + it("should match a simple ' ' transform", function() { + transform = { + insert: " ", + deleteLeft: 0 + }; + + assert.isTrue(TransformUtils.isWhitespace(transform)); + }); + + it("should match a simple ' ' transform with delete-left", function() { + transform = { + insert: " ", + deleteLeft: 1 + }; + + assert.isTrue(TransformUtils.isWhitespace(transform)); + }); + + it("should match a transform consisting of multiple characters of only whitespace", function() { + transform = { + insert: " \n\r\u00a0\t\u2000 ", + deleteLeft: 0 + }; + + assert.isTrue(TransformUtils.isWhitespace(transform)); + }); + + it("stress tests", function() { + transform = { + insert: " \n\r\u00a0\ta\u2000 ", // the 'a' should cause failure. + deleteLeft: 0 + }; + + assert.isFalse(TransformUtils.isWhitespace(transform)); + }); + }); +}); diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index b91951722f1..8c82f16b3be 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -1,18 +1,14 @@ class TransformUtils { static isWhitespace(transform: Transform): boolean { // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. - let whitespaceRemover = /.*[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]/i; + const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i; // Filter out null-inserts; their high probability can cause issues. - if(transform.insert == '') { // Can actually register as 'whitespace'. + if(transform.insert == '') { return false; } - let insert = transform.insert; - - insert = insert.replace(whitespaceRemover, ''); - - return insert == ''; + return transform.insert.match(whitespaceRemover) != null; } static isBackspace(transform: Transform): boolean { From 665b149b096735a361ae4dc79f6678198abd872c Mon Sep 17 00:00:00 2001 From: "Joshua A. 
Horton" Date: Mon, 12 Sep 2022 15:04:17 +0700 Subject: [PATCH 17/19] fix(common/models): needed export for unit tests --- common/web/lm-worker/src/index.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/web/lm-worker/src/index.ts b/common/web/lm-worker/src/index.ts index 7cc3bcbde1d..e3c16fa2c86 100644 --- a/common/web/lm-worker/src/index.ts +++ b/common/web/lm-worker/src/index.ts @@ -32,6 +32,7 @@ /// /// /// +/// /** * Encapsulates all the state required for the LMLayer's worker thread. @@ -407,6 +408,7 @@ if (typeof module !== 'undefined' && typeof module.exports !== 'undefined') { module.exports['wordBreakers'] = wordBreakers; /// XXX: export the ModelCompositor for testing. module.exports['ModelCompositor'] = ModelCompositor; + module.exports['TransformUtils'] = TransformUtils; } else if (typeof self !== 'undefined' && 'postMessage' in self && 'importScripts' in self) { // Automatically install if we're in a Web Worker. LMLayerWorker.install(self as any); // really, 'as typeof globalThis', but we're currently getting TS errors from use of that. From a390c2fb2ffce809ed4f8cd5289bff172a2b062b Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 14 Sep 2022 13:32:14 +0700 Subject: [PATCH 18/19] chore(common/models): Apply suggestions from code review Co-authored-by: Marc Durdin --- common/web/lm-worker/src/transformUtils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index 8c82f16b3be..b5aa598b653 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -1,6 +1,6 @@ class TransformUtils { static isWhitespace(transform: Transform): boolean { - // Matches prefixed text + any instance of a character with Unicode general property Z* or the following: CR, LF, and Tab. + // Matches a string that is entirely one or more characters with Unicode general property Z* or the following: CR, LF, and Tab. const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i; // Filter out null-inserts; their high probability can cause issues. From 917fce0750ae35026d1dd6c1c0f24eb7e35e4bcd Mon Sep 17 00:00:00 2001 From: "Joshua A. Horton" Date: Thu, 15 Sep 2022 08:20:36 +0700 Subject: [PATCH 19/19] chore(common/models): final requested tweak --- common/web/lm-worker/src/transformUtils.ts | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common/web/lm-worker/src/transformUtils.ts b/common/web/lm-worker/src/transformUtils.ts index b5aa598b653..cbb2c151fe7 100644 --- a/common/web/lm-worker/src/transformUtils.ts +++ b/common/web/lm-worker/src/transformUtils.ts @@ -2,12 +2,6 @@ class TransformUtils { static isWhitespace(transform: Transform): boolean { // Matches a string that is entirely one or more characters with Unicode general property Z* or the following: CR, LF, and Tab. const whitespaceRemover = /^[\u0009\u000A\u000D\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000]+$/i; - - // Filter out null-inserts; their high probability can cause issues. - if(transform.insert == '') { - return false; - } - return transform.insert.match(whitespaceRemover) != null; }