@@ -17,7 +17,8 @@ import { ContextToken } from './context-token.js';
import { ContextTokenization } from './context-tokenization.js';
import { ContextTransition } from './context-transition.js';
import { determineModelTokenizer } from '../model-helpers.js';
import { tokenizeAndFilterDistribution } from './transform-tokenization.js';
import { legacySubsetKeyer, TokenizationSubsetBuilder } from './tokenization-subsets.js';
import TransformUtils from '../transformUtils.js';

import Context = LexicalModelTypes.Context;
import Distribution = LexicalModelTypes.Distribution;
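
The `TokenizationSubsetBuilder` and `legacySubsetKeyer` imports are new in this PR, and their source isn't shown in this diff. From the call sites below (`addPrecomputation`, `keyer`, `subsets`, `pendingSet`), the builder plausibly has roughly the following shape - a hedged sketch reconstructed from usage alone, not the actual contents of `./tokenization-subsets.js`:

// Hypothetical reconstruction from this diff's call sites only.
type SubsetKeyer<Analysis> = (analysis: Analysis) => string;

class TokenizationSubsetBuilderSketch<Tokenization, Analysis> {
  // Buckets probability mass under the key of each tokenization analysis.
  readonly subsets = new Map<string, {
    p: number;                                // accumulated probability mass
    pendingSet: Map<Tokenization, Analysis>;  // one analysis per base tokenization
  }>();

  constructor(readonly keyer: SubsetKeyer<Analysis>) {}

  addPrecomputation(base: Tokenization, analysis: Analysis, p: number): void {
    const key = this.keyer(analysis);
    let subset = this.subsets.get(key);
    if(!subset) {
      subset = { p: 0, pendingSet: new Map() };
      this.subsets.set(key, subset);
    }
    subset.p += p;
    subset.pendingSet.set(base, analysis);
  }
}

Under this reading, transforms whose analyses share a key (e.g. identical whitespaced tokenizations) collapse into a single subset, which is why the code below can look up the true input's subset by its key alone.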
@@ -197,57 +198,58 @@ export class ContextState {
): ContextTransition {
const lexicalModel = this.model;

// Apply all transforms to the base context state
const transformSequenceDistribution = tokenizeAndFilterDistribution(context, lexicalModel, transformDistribution);
const postContext = transformDistribution?.[0] ? applyTransform(transformDistribution[0].sample, context) : context;

// Note for future: the next line's pattern asserts that there is only one true tokenization.
// We may eventually allow for multiple potential tokenizations (per epic-dict-breaker)
const tokenizedContext = determineModelTokenizer(lexicalModel)(postContext).left;
if(tokenizedContext.length == 0) {
tokenizedContext.push({text: ''});
}
// In that case, we would need to try to align for each of them, starting from the most likely.

// If we're not at the start of the buffer, we're probably a sliding context.
const isSliding = !this.context.startOfBuffer;

// It's possible the tokenization will remember more of the initial token than is
// actually present in the sliding context window, which imposes a need for a wide-band
// computeDistance 'radius' in the called function.
const alignmentResults = this.tokenization.computeAlignment(tokenizedContext.map((token) => token.text), isSliding, isApplyingSuggestion);
const trueInput = transformDistribution[0].sample;
const transition = new ContextTransition(this, this.appliedInput?.id);

if(alignmentResults.canAlign == false) { // Needs to be explicit for TS type inference.
if(console && console.error) {
console.error(`Could not align contexts with edit path ${JSON.stringify(alignmentResults.editPath)}`);
}
return null;
}
// From here on, we work toward the common case - re-using old info when
// context (and its tokenization) is changed by an input Transform.

const resultTokenization = this.tokenization.transitionTo(
tokenizedContext,
alignmentResults,
lexicalModel,
transformSequenceDistribution
);
let trueInputSubsetKey: string;
const slideUpdateTransform = determineContextSlideTransform(this.context, context);

if(!resultTokenization) {
if(console && console.error) {
console.error(`Transition to alignable tokenization failed: alignment properties ${JSON.stringify(alignmentResults)}`);
}
return null;
}
// Goal: allow multiple base tokenizations.
const startTokenizations = [this.tokenization];
const startTokenizationsAfterSlide = startTokenizations.map(t => t.applyContextSlide(lexicalModel, slideUpdateTransform));

const transition = new ContextTransition(this, this.appliedInput?.id);
// Occurs on context resets & after applying suggestions/reversions
if(resultTokenization == this.tokenization) {
// Easy case - no net change to the tokenizations whatsoever; the actual request
// aims to save-state the most recent results.
//
// This behavior occurs during context resets & after applying suggestions/reversions.
if(TransformUtils.isEmpty(trueInput) && transformDistribution.length == 1) {
// If the tokenizations match, clone the ContextState; we want to preserve a post-application
// context separately from pre-application contexts for predictions based on empty roots.
const state = new ContextState(this);
state.tokenization = startTokenizationsAfterSlide[0];
transition.finalize(state, transformDistribution);
return transition;
}

const subsetBuilder = new TokenizationSubsetBuilder(legacySubsetKeyer);
for(let baseTokenization of startTokenizationsAfterSlide) {

for(let mass of transformDistribution) {
const tokenizationAnalysis = baseTokenization.mapWhitespacedTokenization(lexicalModel, mass.sample);
subsetBuilder.addPrecomputation(baseTokenization, tokenizationAnalysis, mass.p);

if(mass.sample == trueInput) {
trueInputSubsetKey = subsetBuilder.keyer(tokenizationAnalysis);
}
}
}

// And now to (partly) detransform from a multiple-tokenization paradigm.
const trueInputSubset = subsetBuilder.subsets.get(trueInputSubsetKey);
// Right now, we only have one base tokenization, so we just fetch it.
const baseTokenization = startTokenizationsAfterSlide[0];
// For multiple tokenizations, we'd retrieve each, use the "most likely" one as base,
// and then fold all resulting search spaces (on the final token) into one.
const tokenizationAnalysis = trueInputSubset.pendingSet.get(baseTokenization);

// Should gain one per subsetBuilder.subsets entry.
const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, lexicalModel, trueInput);

// ------------

// So, if we have a suggestion transition ID at the end and didn't just apply...
// we've just returned to the end of an applied suggestion's token.
//
@@ -272,28 +274,74 @@
// We expect such cases to have SOMETHING for a preservation transform here;
// we need to ensure that any suggestions for the new token believe that
// the token is starting fresh, without any prior text.
if(alignmentResults.tailTokenShift > 0) {
//
// We will eventually want to build `preservationTransform`s based on the path
// leading to each correction/suggestion. But, until now, we've built them
// based upon the actual input transform alone - so we'll maintain that
// behavior (temporarily) as a transitional state.

const bestResultAnalysis = tokenizationAnalysis;
// This is the analysis for the true input - the ideal transform we found.

// If tokens were inserted, emit an empty transform; this prevents
// suggestions from replacing the "current" token.
const bestTokenizedInput = bestResultAnalysis.inputs[0].sample;
if(bestTokenizedInput.size > 1 || bestTokenizedInput.has(1)) {
preservationTransform = { insert: '', deleteLeft: 0 };
}

if(transformSequenceDistribution) {
const transformKeys = [...transformSequenceDistribution[0].sample.keys()];
// Leave out the final entry - that part is replaceable by suggestions.
transformKeys.pop();

for(let i of transformKeys) {
const primaryInput = transformSequenceDistribution[0].sample.get(i);
if(!preservationTransform) {
preservationTransform = primaryInput;
} else {
preservationTransform.insert += primaryInput.insert;
preservationTransform.deleteLeft += primaryInput.deleteLeft;
}
const transformKeys = [...bestResultAnalysis.inputs[0].sample.keys()];
transformKeys.pop();

for(let i of transformKeys) {
/*
* Thinking ahead to multitokenization:
*
* If what we have is not on the "true" tokenization, then... we need to
* do multitoken effects, right? We'd be basing new suggestions on a
* state that does not currently exist! We'd need to enforce THAT state,
* *then* do the suggestion!
* - Which gets fun if we auto-apply such a case, as the new "true" tokenization
* no longer results directly from the true input.
*
* If we give tokens unique IDs on first creation, we could backtrace to
* find the most recent common ancestor.
* - simple cases (same 'token', but different input transform lengths/effects)
* will have the same prior token ID
*/
const primaryInput = bestResultAnalysis.inputs[0].sample.get(i);
if(!preservationTransform) {
preservationTransform = primaryInput;
} else {
preservationTransform.insert += primaryInput.insert;
preservationTransform.deleteLeft += primaryInput.deleteLeft;
}
}

const state = new ContextState(postContext, lexicalModel);
state.tokenization = resultTokenization;
const postContext = transformDistribution?.[0] ? applyTransform(transformDistribution[0].sample, context) : context;

// Note for future: the next line's pattern asserts that there is only one true tokenization.
// We may eventually allow for multiple potential tokenizations (per epic-dict-breaker)
const tokenizedContext = determineModelTokenizer(lexicalModel)(postContext).left;
if(tokenizedContext.length == 0) {
tokenizedContext.push({text: ''});
}
// In that case, we would need to try to align for each of them, starting from the most likely.

// If we're not at the start of the buffer, we're probably a sliding context.
const isSliding = !this.context.startOfBuffer;

// It's possible the tokenization will remember more of the initial token than is
// actually present in the sliding context window, which imposes a need for a wide-band
// computeDistance 'radius' in the called function.
const alignmentResults = this.tokenization.computeAlignment(tokenizedContext.map((token) => token.text), isSliding, isApplyingSuggestion);

// Stopgap: add tokenized transformSequenceDistribution to the alignment data & use that
// where noted: tagTokens() in context-transition.ts, `determineSuggestionAlignment()`.


const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
state.tokenization = new ContextTokenization(resultTokenization.tokens, alignmentResults);
state.appliedInput = transformDistribution?.[0].sample;
transition.finalize(state, transformDistribution, preservationTransform);
transition.revertableTransitionId = appliedSuggestionTransitionId;
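
To make the preservation fold above concrete: suppose (hypothetically) the true input's tokenized sample maps 0 => {insert: 'hello', deleteLeft: 0}, 1 => {insert: ' ', deleteLeft: 0}, 2 => {insert: 'w', deleteLeft: 0}. Popping the final key leaves [0, 1], so the fold yields {insert: 'hello ', deleteLeft: 0}, and suggestions remain free to replace only the trailing 'w'. A minimal sketch of that accumulation, assuming simple insert-only pieces:

// Minimal sketch of the preservation fold; hypothetical helper, not PR code.
function foldPreservation(sample: Map<number, { insert: string; deleteLeft: number }>) {
  const keys = [...sample.keys()];
  keys.pop(); // the final piece stays replaceable by suggestions
  let preservation: { insert: string; deleteLeft: number } | null = null;
  for(const i of keys) {
    const piece = sample.get(i)!;
    if(!preservation) {
      preservation = { ...piece }; // clone, so the stored transform isn't mutated
    } else {
      preservation.insert += piece.insert;
      preservation.deleteLeft += piece.deleteLeft;
    }
  }
  return preservation; // { insert: 'hello ', deleteLeft: 0 } for the example above
}

One design note: this sketch clones on first assignment, whereas the diff assigns the map's transform directly before appending to it; the clone keeps the source distribution untouched.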
@@ -315,7 +363,7 @@
* @returns The substring prepended to the context (if sliding backward) or the
* number of codepoints removed from its start (if sliding forward)
*/
export function determineContextSlideTransform(srcContext: Context, dstContext: Context): Transform {
export function determineContextSlideTransform(srcContext: Context, dstContext: Context): Transform & { deleteRight: number } {
// Assumption: the current (sliding) context window is alignable.
// See `matchBaseContextState` in ../predict-helpers.ts.
