Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -262,30 +262,8 @@ export class ContextState {
const nonEmptyTail = !tokens[lastIndex].isEmptyToken ? tokens[lastIndex] : tokens[lastIndex - 1];
const appliedSuggestionTransitionId = nonEmptyTail?.appliedTransitionId;

const postContext = transformDistribution?.[0] ? applyTransform(transformDistribution[0].sample, context) : context;

// Note for future: the next line's pattern asserts that there is only one true tokenization.
// We may eventually allow for multiple potential tokenizations (per epic-dict-breaker)
const tokenizedContext = determineModelTokenizer(lexicalModel)(postContext).left;
if(tokenizedContext.length == 0) {
tokenizedContext.push({text: ''});
}
// If multiple tokenizations are ever allowed, we would need to attempt alignment for each of them, starting from the most likely.

// If we're not at the start of the buffer, we're probably a sliding context.
const isSliding = !this.context.startOfBuffer;

// It's possible the tokenization will remember more of the initial token than is
// actually present in the sliding context window, which imposes a need for a wide-band
// computeDistance 'radius' in the called function.
const alignmentResults = this.tokenization.computeAlignment(tokenizedContext.map((token) => token.text), isSliding, isApplyingSuggestion);

// Stopgap: add tokenized transformSequenceDistribution to the alignment data & use that
// where noted: tagTokens() in context-transition.ts, `determineSuggestionAlignment()`.


const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
state.tokenization = new ContextTokenization(resultTokenization.tokens, alignmentResults, resultTokenization.taillessTrueKeystroke);
state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke);
state.appliedInput = transformDistribution?.[0].sample;
transition.finalize(state, transformDistribution, resultTokenization.taillessTrueKeystroke);
transition.revertableTransitionId = appliedSuggestionTransitionId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import { KMWString } from '@keymanapp/web-utils';

import { ContextToken } from './context-token.js';
import TransformUtils from '../transformUtils.js';
import { computeAlignment, ContextStateAlignment } from './alignment-helpers.js';
import { computeDistance, EditOperation, EditTuple } from './classical-calculation.js';
import { determineModelTokenizer } from '../model-helpers.js';
import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js';
Expand Down Expand Up @@ -105,12 +104,11 @@ export class ContextTokenization {
* The sequence of tokens in the context represented by this instance.
*/
readonly tokens: ContextToken[];

/**
* The tokenization-transition metadata relating this instance to the most likely
* tokenization from a prior state.
*/
readonly alignment?: ContextStateAlignment;
readonly transitionEdits?: PendingTokenization;

/**
* The portion of edits from the true input keystroke that are not part of the
Expand All @@ -125,21 +123,21 @@ export class ContextTokenization {

constructor(priorToClone: ContextTokenization);
constructor(tokens: ContextToken[]);
constructor(tokens: ContextToken[], alignment: ContextStateAlignment, taillessTrueKeystroke: Transform);
constructor(tokens: ContextToken[], alignment: PendingTokenization, taillessTrueKeystroke: Transform);
constructor(
param1: ContextToken[] | ContextTokenization,
alignment?: ContextStateAlignment,
alignment?: PendingTokenization,
taillessTrueKeystroke?: Transform
) {
if(!(param1 instanceof ContextTokenization)) {
const tokens = param1;
this.tokens = [].concat(tokens);
this.alignment = alignment;
this.transitionEdits = alignment;
this.taillessTrueKeystroke = taillessTrueKeystroke;
} else {
const priorToClone = param1;
this.tokens = priorToClone.tokens.map((entry) => new ContextToken(entry));
this.alignment = {...priorToClone.alignment};
this.transitionEdits = {...priorToClone.transitionEdits};
this.taillessTrueKeystroke = priorToClone.taillessTrueKeystroke;
}
}
Expand Down Expand Up @@ -169,20 +167,6 @@ export class ContextTokenization {
return this.tokens.map(token => token.exampleInput);
}

/**
 * Aligns an incoming tokenization against the tokenization held by this
 * instance, delegating the actual work to the imported `computeAlignment`
 * helper.
 * @param incomingTokenization Raw token strings from tokenizing the incoming context.
 * @param isSliding Signals that the context window is full, so sliding alignment
 * is particularly needed.
 * @param noSubVerify When true, disables the inspection of 'substitute' transitions
 * that avoids wholesale replacement of the original token.
 * @returns Alignment data describing whether, and how, the incoming tokenization
 * lines up with the one modeled by this instance.
 */
computeAlignment(incomingTokenization: string[], isSliding: boolean, noSubVerify?: boolean): ContextStateAlignment {
// This instance's side of the comparison, as exposed by `exampleInput`.
const currentTokenization = this.exampleInput;
return computeAlignment(currentTokenization, incomingTokenization, isSliding, noSubVerify);
}

/**
* Applies the specified Transform to the _left-hand_ side of the context in
* order to update and match the current contents of the sliding context
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@ import Reversion = LexicalModelTypes.Reversion;
import Suggestion = LexicalModelTypes.Suggestion;
import Transform = LexicalModelTypes.Transform;


// Mark affected tokens with the applied-suggestion transition ID
// for easy future reference.
const tagTokens = (state: ContextState, suggestion: Suggestion) => {
const alignment = state.tokenization.alignment
const appliedTokenCount = (alignment.canAlign && true) && (alignment.tailEditLength + Math.max(alignment.tailTokenShift, 0));
const inputs = state.tokenization.transitionEdits.inputs;
const appliedTokenCount = inputs[0].sample.size;
const tokens = state.tokenization.tokens;
for(let i = tokens.length - appliedTokenCount; i < tokens.length; i++) {
tokens[i].appliedTransitionId = suggestion.transformId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,17 +342,18 @@ export function determineSuggestionAlignment(
*/
deleteLeft: number
} {
const alignment = transition.final.tokenization.alignment;
const transitionEdits = transition.final.tokenization.transitionEdits;
const context = transition.base.context;
const postContext = transition.final.context;
const inputTransform = transition.inputDistribution[0].sample;
const inputTransformMap = transitionEdits?.inputs[0].sample;
let deleteLeft: number;

// If the context now has more tokens, the token we'll be 'predicting' didn't originally exist.
const wordbreak = determineModelWordbreaker(lexicalModel);

// Is the token under construction newly-constructed / is there no pre-existing root?
if(transition.preservationTransform && alignment?.canAlign && alignment.tailTokenShift > 0) {
if(transition.preservationTransform && inputTransformMap?.has(1)) {
return {
// If the new token is due to whitespace or due to a different input type
// that would likely imply a tokenization boundary, infer 'new word' mode.
Expand All @@ -365,7 +366,7 @@ export function determineSuggestionAlignment(
deleteLeft: 0
};
// If the tokenized context length is shorter... sounds like a backspace (or similar).
} else if (alignment?.canAlign && alignment.tailTokenShift < 0) {
} else if (transitionEdits?.alignment.removedTokenCount > 0) {
/* Ooh, we've dropped context here. Almost certainly from a backspace or
* similar effect. Even if we drop multiple tokens... well, we know exactly
* how many chars were actually deleted - `inputTransform.deleteLeft`. Since
Expand Down
Loading