From 713ea054a998f58cf828e35f542888ed94c1e718 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Thu, 6 Mar 2025 20:04:23 +0100
Subject: [PATCH 01/13] define types to allow building a stateful lexer

---
 packages/ts-parsec/src/Lexer.ts | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 8f82ed4..6261a03 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -83,8 +83,11 @@ class TokenImpl<T> implements Token<T> {
     }
 }
 
+type LexerState<T> = [boolean, RegExp, T, LexerState<T> | "pop"][];
+type TopLevelLexerRule<T> = [boolean, RegExp, T, LexerState<T>?];
+
 class LexerImpl<T> implements Lexer<T> {
-    constructor(public rules: [boolean, RegExp, T][]) {
+    constructor(public rules: TopLevelLexerRule<T>[]) {
         for (const rule of this.rules) {
             if (rule[1].source[0] !== '^') {
                 throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${rule[1].source}`);
@@ -155,6 +158,6 @@ class LexerImpl<T> implements Lexer<T> {
     }
 }
 
-export function buildLexer<T>(rules: [boolean, RegExp, T][]): Lexer<T> {
+export function buildLexer<T>(rules: TopLevelLexerRule<T>[]): Lexer<T> {
     return new LexerImpl<T>(rules);
 }

From 218a1f7ab054d03845916e433e78d326e94449c8 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Thu, 6 Mar 2025 20:26:26 +0100
Subject: [PATCH 02/13] analyze the rules in nested lexer states

---
 packages/ts-parsec/src/Lexer.ts | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 6261a03..fd57000 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -83,19 +83,27 @@ class TokenImpl<T> implements Token<T> {
     }
 }
 
-type LexerState<T> = [boolean, RegExp, T, LexerState<T> | "pop"][];
+type LexerState<T> = [boolean, RegExp, T, (LexerState<T> | "pop")?][];
 type TopLevelLexerRule<T> = [boolean, RegExp, T, LexerState<T>?];
 
+function analyzeLexerRules<T>(rules: LexerState<T>) {
+    for (const [_, regex, _, state] of rules) {
+        if (regex.source[0] !== "^") {
+            throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${regex.source}`);
+        }
+        if (!regex.global) {
+            throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
+        }
+        if (state !== undefined && state !== "pop") {
+            analyzeLexerRules(state);
+        }
+    }
+}
+
 class LexerImpl<T> implements Lexer<T> {
     constructor(public rules: TopLevelLexerRule<T>[]) {
-        for (const rule of this.rules) {
-            if (rule[1].source[0] !== '^') {
-                throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${rule[1].source}`);
-            }
-            if (!rule[1].global) {
-                throw new Error(`Regular expression patterns for a tokenizer should be global: ${rule[1].source}`);
-            }
-        }
+        // Casting `rules` to `LexerState` is safe because `LexerState` is a superset of `TopLevelLexerRule`
+        analyzeLexerRules(rules as LexerState<T>);
     }
 
     public parse(input: string): TokenImpl<T> | undefined {

From ccba6f6b9a3188f7dd902f6d4c0fb81f1684bd96 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Thu, 6 Mar 2025 21:01:05 +0100
Subject: [PATCH 03/13] Implement the stateful lexer

---
 packages/ts-parsec/src/Lexer.ts | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index fd57000..758fe8a 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -101,6 +101,8 @@ class LexerImpl<T> implements Lexer<T> {
+    private states: LexerState<T>[] = [this.rules];
+
     constructor(public rules: TopLevelLexerRule<T>[]) {
         // Casting `rules` to `LexerState` is safe because `LexerState` is a superset of `TopLevelLexerRule`
         analyzeLexerRules(rules as LexerState<T>);
     }
 
     public parse(input: string): TokenImpl<T> | undefined {
@@ -117,7 +119,9 @@ class LexerImpl<T> implements Lexer<T> {
 
         const subString = input.substr(indexStart);
         let result: TokenImpl<T> | undefined;
-        for (const [keep, regexp, kind] of this.rules) {
+        const currentRuleset = this.states[this.states.length - 1];
+        let nextState: LexerState<T> | "pop" | undefined = undefined;
+        for (const [keep, regexp, kind, next] of currentRuleset) {
             regexp.lastIndex = 0;
             if (regexp.test(subString)) {
                 const text = subString.substr(0, regexp.lastIndex);
@@ -134,6 +138,7 @@ class LexerImpl<T> implements Lexer<T> {
                 const newResult = new TokenImpl<T>(this, input, kind, text, { index: indexStart, rowBegin, columnBegin, rowEnd, columnEnd }, keep);
                 if (result === undefined || result.text.length < newResult.text.length) {
                     result = newResult;
+                    nextState = next;
                 }
             }
         }
@@ -144,6 +149,11 @@ class LexerImpl<T> implements Lexer<T> {
                 `Unable to tokenize the rest of the input: ${input.substr(indexStart)}`
             );
         } else {
+            if (nextState === "pop") {
+                this.states.pop();
+            } else if (nextState !== undefined) {
+                this.states.push(nextState);
+            }
             return result;
         }
     }

From 52941079f9f2d50e477c5aa8044f3f535bd06bfa Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Thu, 6 Mar 2025 21:02:19 +0100
Subject: [PATCH 04/13] Resolve naming conflict

---
 packages/ts-parsec/src/Lexer.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 758fe8a..36a8ea5 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -87,7 +87,7 @@ type LexerState<T> = [boolean, RegExp, T, (LexerState<T> | "pop")?][];
 type TopLevelLexerRule<T> = [boolean, RegExp, T, LexerState<T>?];
 
 function analyzeLexerRules<T>(rules: LexerState<T>) {
-    for (const [_, regex, _, state] of rules) {
+    for (const [_keep, regex, _kind, state] of rules) {
         if (regex.source[0] !== "^") {
             throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${regex.source}`);
         }

From c44c98ef875f3888adf64457093a19929494b8e9 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Thu, 6 Mar 2025 21:17:44 +0100
Subject: [PATCH 05/13] apply tslint suggestions to follow project rules

---
 packages/ts-parsec/src/Lexer.ts | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 36a8ea5..40688bb 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -83,18 +83,18 @@ class TokenImpl<T> implements Token<T> {
     }
 }
 
-type LexerState<T> = [boolean, RegExp, T, (LexerState<T> | "pop")?][];
+type LexerState<T> = [boolean, RegExp, T, (LexerState<T> | 'pop')?][];
 type TopLevelLexerRule<T> = [boolean, RegExp, T, LexerState<T>?];
 
-function analyzeLexerRules<T>(rules: LexerState<T>) {
-    for (const [_keep, regex, _kind, state] of rules) {
-        if (regex.source[0] !== "^") {
-            throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${regex.source}`);
+function analyzeLexerRules<T>(rules: LexerState<T>): void {
+    for (const [, regex, , state] of rules) {
+        if (regex.source[0] !== '^') {
+            throw new Error(`Regular expression patterns for a tokenizer should start with '^': ${regex.source}`);
         }
         if (!regex.global) {
             throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
         }
-        if (state !== undefined && state !== "pop") {
+        if (state !== undefined && state !== 'pop') {
             analyzeLexerRules(state);
         }
     }
@@ -105,7 +105,7 @@ class LexerImpl<T> implements Lexer<T> {
 
     constructor(public rules: TopLevelLexerRule<T>[]) {
         // Casting `rules` to `LexerState` is safe because `LexerState` is a superset of `TopLevelLexerRule`
-        analyzeLexerRules(rules as LexerState<T>);
+        analyzeLexerRules(rules);
     }
 
     public parse(input: string): TokenImpl<T> | undefined {
@@ -120,7 +120,7 @@ class LexerImpl<T> implements Lexer<T> {
         const subString = input.substr(indexStart);
         let result: TokenImpl<T> | undefined;
         const currentRuleset = this.states[this.states.length - 1];
-        let nextState: LexerState<T> | "pop" | undefined = undefined;
+        let nextState: LexerState<T> | 'pop' | undefined;
         for (const [keep, regexp, kind, next] of currentRuleset) {
             regexp.lastIndex = 0;
             if (regexp.test(subString)) {
@@ -149,7 +149,7 @@ class LexerImpl<T> implements Lexer<T> {
                 `Unable to tokenize the rest of the input: ${input.substr(indexStart)}`
             );
         } else {
-            if (nextState === "pop") {
+            if (nextState === 'pop') {
                 this.states.pop();
             } else if (nextState !== undefined) {
                 this.states.push(nextState);

From 4c174acc66a9f04164d39bd6a8464f04997a0d68 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Fri, 7 Mar 2025 00:45:15 +0100
Subject: [PATCH 06/13] Test stateful tokenization on c-style block comments

---
 packages/ts-parsec/src/Lexer.ts     |  4 +++
 packages/tspc-test/src/TestLexer.ts | 44 +++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 40688bb..6cb941b 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -176,6 +176,10 @@ class LexerImpl<T> implements Lexer<T> {
     }
 }
 
+export function buildLexerState<T>(rules: LexerState<T>): LexerState<T> {
+    return rules;
+}
+
 export function buildLexer<T>(rules: TopLevelLexerRule<T>[]): Lexer<T> {
     return new LexerImpl<T>(rules);
 }
diff --git a/packages/tspc-test/src/TestLexer.ts b/packages/tspc-test/src/TestLexer.ts
index 59f65ef..d89dce5 100644
--- a/packages/tspc-test/src/TestLexer.ts
+++ b/packages/tspc-test/src/TestLexer.ts
@@ -133,3 +133,47 @@ test(`Lexer: identifiers and numbers with discardable commas and spaces`, () =>
 
     assert.strictEqual(token, undefined);
 });
+
+test(`Lexer: C-style block comments via lexer states`, () => {
+    enum TokenKind {
+        CommentBegin,
+        CommentEnd,
+        CommentContents,
+        Number,
+        Identifier,
+        Comma,
+        Space,
+    }
+
+    const BlockComment = buildLexerState([
+        [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
+        [true, /^[^*]+/g, TokenKind.CommentContents],
+    ]);
+
+    const lexer = buildLexer([
+        [false, /^\/\*/g, TokenKind.CommentBegin, BlockComment],
+        [true, /^\d+/g, TokenKind.Number],
+        [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
+        [false, /^,/g, TokenKind.Comma],
+        [false, /^\s+/g, TokenKind.Space]
+    ]);
+
+    let token = lexer.parse(`123 /* abc */ def`);
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.Number);
+    assert.strictEqual(token.text, '123');
+    token = token.next;
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.CommentContents);
+    assert.strictEqual(token.text, ' abc ');
+    token = token.next;
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.Identifier);
+    assert.strictEqual(token.text, 'def');
+    token = token.next;
+
+    assert.strictEqual(token, undefined);
+});

From 44255c249b32acdb443231d9f3e6281e9ba4a3bc Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Fri, 7 Mar 2025 01:04:59 +0100
Subject: [PATCH 07/13] Don't differentiate between top-level rules and states on the type level

---
 packages/ts-parsec/src/Lexer.ts     | 27 ++++++++++++++-------------
 packages/tspc-test/src/TestLexer.ts |  6 +++---
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 6cb941b..597b524 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -83,10 +83,10 @@ class TokenImpl<T> implements Token<T> {
     }
 }
 
-type LexerState<T> = [boolean, RegExp, T, (LexerState<T> | 'pop')?][];
-type TopLevelLexerRule<T> = [boolean, RegExp, T, LexerState<T>?];
+export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'pop')?];
+export type LexerState<T> = LexerRule<T>[];
 
-function analyzeLexerRules<T>(rules: LexerState<T>): void {
+function analyzeLexerRules<T>(rules: LexerState<T>, topLevel: boolean): void {
     for (const [, regex, , state] of rules) {
         if (regex.source[0] !== '^') {
             throw new Error(`Regular expression patterns for a tokenizer should start with '^': ${regex.source}`);
@@ -94,8 +94,14 @@ function analyzeLexerRules<T>(rules: LexerState<T>): void {
         if (!regex.global) {
             throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
         }
-        if (state !== undefined && state !== 'pop') {
-            analyzeLexerRules(state);
+        if (state !== undefined) {
+            if (state === 'pop') {
+                if (topLevel) {
+                    throw new Error(`The 'pop' directive is not allowed in the top-level lexer state`);
+                }
+            } else {
+                analyzeLexerRules(state, false);
+            }
         }
     }
 }
@@ -103,9 +109,8 @@ function analyzeLexerRules<T>(rules: LexerState<T>): void {
 class LexerImpl<T> implements Lexer<T> {
     private states: LexerState<T>[] = [this.rules];
 
-    constructor(public rules: TopLevelLexerRule<T>[]) {
-        // Casting `rules` to `LexerState` is safe because `LexerState` is a superset of `TopLevelLexerRule`
-        analyzeLexerRules(rules);
+    constructor(public rules: LexerState<T>) {
+        analyzeLexerRules(rules, true);
     }
 
     public parse(input: string): TokenImpl<T> | undefined {
@@ -176,10 +181,6 @@ class LexerImpl<T> implements Lexer<T> {
     }
 }
 
-export function buildLexerState<T>(rules: LexerState<T>): LexerState<T> {
-    return rules;
-}
-
-export function buildLexer<T>(rules: TopLevelLexerRule<T>[]): Lexer<T> {
+export function buildLexer<T>(rules: LexerState<T>): Lexer<T> {
     return new LexerImpl<T>(rules);
 }
diff --git a/packages/tspc-test/src/TestLexer.ts b/packages/tspc-test/src/TestLexer.ts
index d89dce5..672615b 100644
--- a/packages/tspc-test/src/TestLexer.ts
+++ b/packages/tspc-test/src/TestLexer.ts
@@ -4,7 +4,7 @@
 // tslint:disable:trailing-comma
 
 import * as assert from 'assert';
-import { buildLexer } from 'typescript-parsec';
+import { buildLexer, LexerState } from 'typescript-parsec';
 
 function notUndefined<T>(t: T | undefined): T {
     assert.notStrictEqual(t, undefined);
@@ -145,10 +145,10 @@ test(`Lexer: C-style block comments via lexer states`, () => {
         Comma,
         Space,
     }
 
-    const BlockComment = buildLexerState([
+    const BlockComment: LexerState<TokenKind> = [
         [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
         [true, /^[^*]+/g, TokenKind.CommentContents],
-    ]);
+    ];
 
     const lexer = buildLexer([

From 429cd450ec1fb0f1ab188866259b17754342d30d Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Fri, 7 Mar 2025 13:06:48 +0100
Subject: [PATCH 08/13] Test the stateful lexer with nested block comments

---
 packages/tspc-test/src/TestLexer.ts | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/packages/tspc-test/src/TestLexer.ts b/packages/tspc-test/src/TestLexer.ts
index 672615b..2c7de11 100644
--- a/packages/tspc-test/src/TestLexer.ts
+++ b/packages/tspc-test/src/TestLexer.ts
@@ -134,7 +134,7 @@ test(`Lexer: identifiers and numbers with discardable commas and spaces`, () =>
     assert.strictEqual(token, undefined);
 });
 
-test(`Lexer: C-style block comments via lexer states`, () => {
+test(`Lexer: C-style nested block comments via lexer states`, () => {
     enum TokenKind {
         CommentBegin,
         CommentEnd,
@@ -146,8 +146,9 @@ test(`Lexer: C-style block comments via lexer states`, () => {
     }
 
     const BlockComment: LexerState<TokenKind> = [
+        [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
         [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
-        [true, /^[^*]+/g, TokenKind.CommentContents],
+        [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
     ];
 
     const lexer = buildLexer([
@@ -155,10 +156,10 @@ test(`Lexer: C-style block comments via lexer states`, () => {
         [true, /^\d+/g, TokenKind.Number],
         [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
         [false, /^,/g, TokenKind.Comma],
-        [false, /^\s+/g, TokenKind.Space]
+        [false, /^\s+/g, TokenKind.Space],
     ]);
 
-    let token = lexer.parse(`123 /* abc */ def`);
+    let token = lexer.parse(`123 /* abc /*456*/*/ def`);
 
     token = notUndefined(token);
     assert.strictEqual(token.kind, TokenKind.Number);
@@ -170,6 +171,11 @@ test(`Lexer: C-style block comments via lexer states`, () => {
     assert.strictEqual(token.text, ' abc ');
     token = token.next;
 
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.CommentContents);
+    assert.strictEqual(token.text, '456');
+    token = token.next;
+
     token = notUndefined(token);
     assert.strictEqual(token.kind, TokenKind.Identifier);
     assert.strictEqual(token.text, 'def');

From a519209633549d775040ba434653cc0372ccf797 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Fri, 7 Mar 2025 13:07:28 +0100
Subject: [PATCH 09/13] Support pushing the current state to the stack again

---
 packages/ts-parsec/src/Lexer.ts | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 597b524..1e4e7ac 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -83,7 +83,7 @@ class TokenImpl<T> implements Token<T> {
     }
 }
 
-export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'pop')?];
+export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'push' | 'pop')?];
 export type LexerState<T> = LexerRule<T>[];
 
 function analyzeLexerRules<T>(rules: LexerState<T>, topLevel: boolean): void {
@@ -95,9 +95,9 @@ function analyzeLexerRules<T>(rules: LexerState<T>, topLevel: boolean): void {
             throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
         }
         if (state !== undefined) {
-            if (state === 'pop') {
+            if (state === 'pop' || state === 'push') {
                 if (topLevel) {
-                    throw new Error(`The 'pop' directive is not allowed in the top-level lexer state`);
+                    throw new Error(`The 'push' and 'pop' directives are not allowed in the top-level lexer state`);
                 }
             } else {
                 analyzeLexerRules(state, false);
@@ -125,7 +125,7 @@ class LexerImpl<T> implements Lexer<T> {
         const subString = input.substr(indexStart);
         let result: TokenImpl<T> | undefined;
         const currentRuleset = this.states[this.states.length - 1];
-        let nextState: LexerState<T> | 'pop' | undefined;
+        let nextState: LexerState<T> | 'push' | 'pop' | undefined;
         for (const [keep, regexp, kind, next] of currentRuleset) {
             regexp.lastIndex = 0;
             if (regexp.test(subString)) {
@@ -156,6 +156,8 @@ class LexerImpl<T> implements Lexer<T> {
         } else {
             if (nextState === 'pop') {
                 this.states.pop();
+            } else if(nextState === 'push') {
+                this.states.push(currentRuleset);
             } else if (nextState !== undefined) {
                 this.states.push(nextState);
             }

From 1b81b3e2baa952d69ccbe64d72750601ca456986 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Fri, 7 Mar 2025 13:09:24 +0100
Subject: [PATCH 10/13] apply tslint suggestions

---
 doc/Tokenizer.md                    | 44 +++++++++++++++++++++++++++++
 packages/ts-parsec/src/Lexer.ts     |  2 +-
 packages/tspc-test/src/TestLexer.ts |  4 +--
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/doc/Tokenizer.md b/doc/Tokenizer.md
index 4b7b5b7..8151500 100644
--- a/doc/Tokenizer.md
+++ b/doc/Tokenizer.md
@@ -51,3 +51,47 @@ For some languages, like VB.NET, it has a context sensitive tokenizer. You could
 ## NOTE
 
 `buildLexer` only accepts regular expressions like this: `/^xxx/g`.
+
+## Stateful tokenization
+
+Internally, the lexer maintains a stack of states that you can grow. A state is
+defined as the set of rules that the lexer uses to tokenize the input. For
+instance, in the examples shown above, `buildLexer` was used to create a lexer
+with a single state with three rules each. Stateful tokenization is useful if
+you want to provide different rules to the lexer based on previously matched
+tokens.
+
+The following example shows a lexer that tokenizes nested block comments. Start
+by looking at the set of top-level rules defined by `buildLexer`. These rules
+look standard, except for the rule that recognizes a `TokenKind.CommentBegin`.
+When a rule contains a fourth element, and the rule is matched, it means that
+the lexer will switch to a different state. In this case, the fourth element
+tells us that the lexer will switch to the `BlockComment` state by pushing the
+state to its internal stack. The definition of a state works almost analogously
+to the definition of the top-level state using `buildLexer`. When the tokenizer
+switches to another state, only the rules defined inside that state apply
+until the tokenizer leaves the state again. To leave a state, the fourth element
+of a rule can be set to `'pop'`, which pops the state off of the lexer's
+internal stack. In case you wish to push the same state to the stack that you
+are already in, use the `'push'` directive. When the fourth element of a rule is
+omitted, the lexer will remain in its current state.
+
+```typescript
+const blockComment: LexerState<TokenKind> = [
+    [false, /^\/\*/g, TokenKind.CommentBegin, "push"], // nested comment
+    [false, /^\*\//g, TokenKind.CommentEnd, "pop"],
+    [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
+];
+
+const tokenizer = buildLexer([
+    [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
+    [true, /^\d+/g, TokenKind.Number],
+    [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
+    [false, /^,/g, TokenKind.Comma],
+    [false, /^\s+/g, TokenKind.Space],
+]);
+```
+
+Note: Using `'push'` or `'pop'` is not allowed in the top-level state. If you
+wish to switch states from there, you need to provide a concrete instance of the
+new state that should be pushed.
diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 1e4e7ac..b807d1f 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -156,7 +156,7 @@ class LexerImpl<T> implements Lexer<T> {
         } else {
             if (nextState === 'pop') {
                 this.states.pop();
-            } else if(nextState === 'push') {
+            } else if (nextState === 'push') {
                 this.states.push(currentRuleset);
             } else if (nextState !== undefined) {
                 this.states.push(nextState);
diff --git a/packages/tspc-test/src/TestLexer.ts b/packages/tspc-test/src/TestLexer.ts
index 2c7de11..653e4d9 100644
--- a/packages/tspc-test/src/TestLexer.ts
+++ b/packages/tspc-test/src/TestLexer.ts
@@ -145,14 +145,14 @@ test(`Lexer: C-style nested block comments via lexer states`, () => {
         Space,
     }
 
-    const BlockComment: LexerState<TokenKind> = [
+    const blockComment: LexerState<TokenKind> = [
         [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
         [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
         [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
     ];
 
     const lexer = buildLexer([
-        [false, /^\/\*/g, TokenKind.CommentBegin, BlockComment],
+        [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
         [true, /^\d+/g, TokenKind.Number],
         [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
         [false, /^,/g, TokenKind.Comma],

From c67989647546305d2f5e8b2ef6643776bd9f88f8 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Tue, 1 Apr 2025 16:38:30 +0200
Subject: [PATCH 11/13] safely analyze recursively dependent lexer states

---
 packages/ts-parsec/src/Lexer.ts | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index b807d1f..e36d10c 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -86,7 +86,12 @@ class TokenImpl<T> implements Token<T> {
 export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'push' | 'pop')?];
 export type LexerState<T> = LexerRule<T>[];
 
-function analyzeLexerRules<T>(rules: LexerState<T>, topLevel: boolean): void {
+function analyzeLexerRules<T>(
+    rules: LexerState<T>,
+    topLevel: boolean,
+    memo: Set<LexerState<T>> = new Set(),
+): void {
+    memo.add(rules);
     for (const [, regex, , state] of rules) {
         if (regex.source[0] !== '^') {
             throw new Error(`Regular expression patterns for a tokenizer should start with '^': ${regex.source}`);
@@ -100,7 +105,10 @@ function analyzeLexerRules<T>(rules: LexerState<T>, topLevel: boolean): void {
                     throw new Error(`The 'push' and 'pop' directives are not allowed in the top-level lexer state`);
                 }
             } else {
-                analyzeLexerRules(state, false);
+                if (memo.has(state)) {
+                    return;
+                }
+                analyzeLexerRules(state, false, memo);
             }
         }
     }
@@ -186,3 +194,18 @@ class LexerImpl<T> implements Lexer<T> {
 export function buildLexer<T>(rules: LexerState<T>): Lexer<T> {
     return new LexerImpl<T>(rules);
 }
+
+// TESTING
+
+const statements: LexerState<string> = [];
+const stringLiteral: LexerState<string> = [];
+
+statements.push(
+    [true, /^"/g, "stringDelimiter", stringLiteral],
+);
+
+stringLiteral.push(
+    [true, /^\${/g, "stringInterpolationDelimiter", statements],
+);
+
+buildLexer(statements);

From 23e8f781c2c26c2ca2ea2a4466b502f8117b046a Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Tue, 1 Apr 2025 16:56:51 +0200
Subject: [PATCH 12/13] allow push and pop directives in the top-level state

---
 packages/ts-parsec/src/Lexer.ts | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index e36d10c..b9decee 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -88,7 +88,6 @@ export type LexerState<T> = LexerRule<T>[];
 
 function analyzeLexerRules<T>(
     rules: LexerState<T>,
-    topLevel: boolean,
     memo: Set<LexerState<T>> = new Set(),
 ): void {
     memo.add(rules);
@@ -100,15 +99,11 @@ function analyzeLexerRules<T>(
             throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
         }
         if (state !== undefined) {
-            if (state === 'pop' || state === 'push') {
-                if (topLevel) {
-                    throw new Error(`The 'push' and 'pop' directives are not allowed in the top-level lexer state`);
-                }
-            } else {
+            if (state !== 'pop' && state !== 'push') {
                 if (memo.has(state)) {
                     return;
                 }
-                analyzeLexerRules(state, false, memo);
+                analyzeLexerRules(state, memo);
             }
         }
     }
@@ -118,7 +113,7 @@ class LexerImpl<T> implements Lexer<T> {
     private states: LexerState<T>[] = [this.rules];
 
     constructor(public rules: LexerState<T>) {
-        analyzeLexerRules(rules, true);
+        analyzeLexerRules(rules);
     }
 
     public parse(input: string): TokenImpl<T> | undefined {

From 6a4c0841096d3cc9454a1dae155d2928f153b210 Mon Sep 17 00:00:00 2001
From: Leon Puchinger
Date: Wed, 2 Apr 2025 23:39:23 +0200
Subject: [PATCH 13/13] update documentation

---
 doc/Tokenizer.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/doc/Tokenizer.md b/doc/Tokenizer.md
index 8151500..d8022d6 100644
--- a/doc/Tokenizer.md
+++ b/doc/Tokenizer.md
@@ -91,7 +91,3 @@ const tokenizer = buildLexer([
     [false, /^\s+/g, TokenKind.Space],
 ]);
 ```
-
-Note: Using `'push'` or `'pop'` is not allowed in the top-level state. If you
-wish to switch states from there, you need to provide a concrete instance of the
-new state that should be pushed.
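
As a closing illustration, here is a minimal sketch of the mutually recursive states that patches 11 and 12 make possible together, in the spirit of the `// TESTING` snippet left in patch 11: a top-level statement state and a string-literal state that push each other to tokenize `${...}` interpolation. The token kinds, regular expressions, and sample input below are illustrative assumptions rather than code from the library or its tests, and the sketch does not guard against a stray `}` popping the base state off the stack.

```typescript
import { buildLexer, LexerState } from 'typescript-parsec';

enum TokenKind {
    StringDelimiter,
    StringContents,
    InterpolationBegin,
    InterpolationEnd,
    Identifier,
    Space,
}

// Two mutually recursive states; `statements` doubles as the top-level state.
const statements: LexerState<TokenKind> = [];
const stringLiteral: LexerState<TokenKind> = [];

statements.push(
    // An opening quote pushes the string-literal state.
    [false, /^"/g, TokenKind.StringDelimiter, stringLiteral],
    // A closing brace pops back into the enclosing string literal.
    // Using 'pop' here, in the top-level state, is allowed since patch 12.
    [false, /^\}/g, TokenKind.InterpolationEnd, 'pop'],
    [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
    [false, /^\s+/g, TokenKind.Space],
);

stringLiteral.push(
    // `${` re-enters the statement state inside the literal.
    [false, /^\$\{/g, TokenKind.InterpolationBegin, statements],
    // The closing quote leaves the string literal.
    [false, /^"/g, TokenKind.StringDelimiter, 'pop'],
    [true, /^(?:(?!\$\{|").)+/g, TokenKind.StringContents],
);

// The memoized analysis added in patch 11 keeps buildLexer from recursing
// forever over the statements <-> stringLiteral cycle.
const lexer = buildLexer(statements);
lexer.parse('greet "hello ${name}"');
```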