diff --git a/crates/ast/ast.json b/crates/ast/ast.json
index e02d91648..14157f505 100644
--- a/crates/ast/ast.json
+++ b/crates/ast/ast.json
@@ -2,6 +2,9 @@
   "Void": {
     "_type": "enum"
   },
+  "Asi": {
+    "_type": "struct"
+  },
   "Argument": {
     "_type": "enum",
     "SpreadElement": "Box",
diff --git a/crates/ast/src/source_location.rs b/crates/ast/src/source_location.rs
index 2219fa451..78fdb2e23 100644
--- a/crates/ast/src/source_location.rs
+++ b/crates/ast/src/source_location.rs
@@ -25,8 +25,10 @@ impl SourceLocation {
         self.start = start.start;
         self.end = end.end;
     }
+}
 
-    pub fn default() -> Self {
+impl Default for SourceLocation {
+    fn default() -> Self {
         Self { start: 0, end: 0 }
     }
 }
diff --git a/crates/generated_parser/src/ast_builder.rs b/crates/generated_parser/src/ast_builder.rs
index ca134da86..5d8216516 100644
--- a/crates/generated_parser/src/ast_builder.rs
+++ b/crates/generated_parser/src/ast_builder.rs
@@ -84,6 +84,11 @@ impl<'alloc> AstBuilder<'alloc> {
         list.append(elements);
     }
 
+    // Automatic Semicolon Insertion
+    pub fn asi(&self) -> arena::Box<'alloc, Asi> {
+        self.alloc_with(|| Asi::default())
+    }
+
     // IdentifierReference : Identifier
     pub fn identifier_reference(
         &self,
diff --git a/crates/generated_parser/src/traits/mod.rs b/crates/generated_parser/src/traits/mod.rs
index 69e2c7dae..4c2386ec5 100644
--- a/crates/generated_parser/src/traits/mod.rs
+++ b/crates/generated_parser/src/traits/mod.rs
@@ -32,5 +32,5 @@ pub trait ParserTrait<'alloc, Value> {
     fn replay(&mut self, tv: TermValue);
     fn epsilon(&mut self, state: usize);
     fn top_state(&self) -> usize;
-    fn check_not_on_new_line(&mut self, peek: usize) -> Result<'alloc, bool>;
+    fn is_on_new_line(&self) -> Result<'alloc, bool>;
 }
diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs
index c9f12a56e..876321266 100644
--- a/crates/parser/src/parser.rs
+++ b/crates/parser/src/parser.rs
@@ -96,19 +96,13 @@ impl<'alloc> ParserTrait<'alloc, StackValue<'alloc>> for Parser<'alloc> {
     fn top_state(&self) -> usize {
         self.state()
     }
-    fn check_not_on_new_line(&mut self, peek: usize) -> Result<'alloc, bool> {
+    fn is_on_new_line(&self) -> Result<'alloc, bool> {
         let sv = {
             let stack = self.node_stack.stack_slice();
-            &stack[stack.len() - peek].value
+            &stack[stack.len() - 1].value
         };
         if let StackValue::Token(ref token) = sv {
-            if !token.is_on_new_line {
-                return Ok(true);
-            }
-            self.rewind(peek - 1);
-            let tv = self.pop();
-            self.try_error_handling(tv)?;
-            return Ok(false);
+            return Ok(token.is_on_new_line);
         }
         Err(ParseError::NoLineTerminatorHereExpectedToken.into())
     }
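For context, the `recover` logic deleted below implemented the ECMAScript ASI criterion at runtime; after this patch the same criterion is encoded in the parse table by `expand_javascript_asi` (see jsparagus/parse_table.py further down). Here is a minimal, self-contained sketch of that criterion; the `Token` class is a hypothetical stand-in, not the crate's token type:

```python
# A minimal sketch (not jsparagus code) of the ASI criterion that
# `recover` used to test at runtime and that `expand_javascript_asi`
# now encodes directly in the parse table. `Token` is a stand-in type.
from dataclasses import dataclass


@dataclass
class Token:
    kind: str               # e.g. "}", "End", "Identifier"
    is_on_new_line: bool    # preceded by a LineTerminator?


def asi_allowed(offending: Token) -> bool:
    """ECMAScript 11.9.1: a semicolon may be inserted before an offending
    token if it follows a LineTerminator, is `}`, or is the end of input."""
    return (offending.is_on_new_line
            or offending.kind == "}"
            or offending.kind == "End")


assert asi_allowed(Token("Identifier", True))       # return \n x
assert asi_allowed(Token("}", False))               # { x = 1 }
assert not asi_allowed(Token("Identifier", False))  # no insertion point
```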
@@ -194,51 +188,14 @@ impl<'alloc> Parser<'alloc> {
             }),
         });
         if let StackValue::Token(ref token) = t.value {
-            // Error tokens might them-self cause more errors to be reported.
-            // This happens due to the fact that the ErrorToken can be replayed,
-            // and while the ErrorToken might be in the lookahead rules, it
-            // might not be in the shifted terms coming after the reduced
-            // nonterminal.
-            if t.term == TerminalId::ErrorToken.into() {
-                return Err(Self::parse_error(token).into());
-            }
-
-            // Otherwise, check if the current rule accept an Automatic
-            // Semi-Colon insertion (ASI).
-            let state = self.state();
-            assert!(state < TABLES.shift_count);
-            let error_code = TABLES.error_codes[state];
-            if let Some(error_code) = error_code {
-                let err_token = (*token).clone();
-                Self::recover(token, error_code)?;
-                self.replay(t);
-                let err_token = self.handler.alloc(err_token);
-                self.replay(TermValue {
-                    term: TerminalId::ErrorToken.into(),
-                    value: StackValue::Token(err_token),
-                });
-                return Ok(false);
-            }
             // On error, don't attempt error handling again.
             return Err(Self::parse_error(token).into());
         }
         Err(ParseError::ParserCannotUnpackToken.into())
     }
 
-    pub(crate) fn recover(t: &Token, error_code: ErrorCode) -> Result<'alloc, ()> {
-        match error_code {
-            ErrorCode::Asi => {
-                if t.is_on_new_line
-                    || t.terminal_id == TerminalId::End
-                    || t.terminal_id == TerminalId::CloseBrace
-                {
-                    Ok(())
-                } else {
-                    Err(Self::parse_error(t).into())
-                }
-            }
-            ErrorCode::DoWhileAsi => Ok(()),
-        }
+    pub(crate) fn recover(t: &Token, _error_code: ErrorCode) -> Result<'alloc, ()> {
+        Err(Self::parse_error(t).into())
     }
 
     fn simulator<'a>(&'a self) -> Simulator<'alloc, 'a> {
diff --git a/crates/parser/src/simulator.rs b/crates/parser/src/simulator.rs
index 96773e1d6..4d828149a 100644
--- a/crates/parser/src/simulator.rs
+++ b/crates/parser/src/simulator.rs
@@ -114,8 +114,8 @@ impl<'alloc, 'parser> ParserTrait<'alloc, ()> for Simulator<'alloc, 'parser> {
     fn top_state(&self) -> usize {
         self.state()
     }
-    fn check_not_on_new_line(&mut self, _peek: usize) -> Result<'alloc, bool> {
-        Ok(true)
+    fn is_on_new_line(&self) -> Result<'alloc, bool> {
+        Ok(false)
     }
 }
""" + __slots__ = ['term'] + + term: ShiftedTerm + + def __init__(self, term: ShiftedTerm): + super().__init__() + self.term = term + + def is_inconsistent(self) -> bool: + return True + + def is_condition(self) -> bool: + return True + + def condition(self) -> Shift: + return self + + def update_stack(self) -> bool: + return True + + def update_stack_with(self) -> StackDiff: + return StackDiff(0, None, -1) + + def __str__(self) -> str: + return "Shift({})".format(str(self.term)) + + class Replay(Action): """Replay a term which was previously saved by the Unwind function. Note that this does not Shift a term given as argument as the replay action should @@ -279,11 +322,14 @@ def shifted_action(self, shifted_term: Element) -> Unwind: class Reduce(Action): """Prevent the fall-through to the epsilon transition and returns to the shift table execution to resume shifting or replaying terms.""" - __slots__ = ['unwind'] + __slots__ = ['unwind', 'lookahead'] unwind: Unwind - def __init__(self, unwind: Unwind) -> None: + # List of lookahead tokens used to prevent aliasing of reduce states. + lookahead: typing.Tuple[Element, ...] + + def __init__(self, unwind: Unwind, lookahead: typing.Tuple[Element, ...] = ()) -> None: nt_name = unwind.nt.name if isinstance(nt_name, InitNt): name = "Start_" + str(nt_name.goal.name) @@ -291,9 +337,10 @@ def __init__(self, unwind: Unwind) -> None: name = nt_name super().__init__() self.unwind = unwind + self.lookahead = lookahead def __str__(self) -> str: - return "Reduce({})".format(str(self.unwind)) + return "Reduce({}, {})".format(str(self.unwind), str(self.lookahead)) def follow_edge(self) -> bool: return False @@ -306,11 +353,11 @@ def update_stack_with(self) -> StackDiff: def unshift_action(self, num: int) -> Reduce: unwind = self.unwind.unshift_action(num) - return Reduce(unwind) + return Reduce(unwind, lookahead=self.lookahead[:-num]) def shifted_action(self, shifted_term: Element) -> Reduce: unwind = self.unwind.shifted_action(shifted_term) - return Reduce(unwind) + return Reduce(unwind, lookahead=(*self.lookahead, shifted_term)) class Accept(Action): @@ -374,18 +421,27 @@ def shifted_action(self, shifted_term: Element) -> ShiftedAction: return not self.accept -class CheckNotOnNewLine(Action): +class CheckLineTerminator(Action): """Check whether the terminal at the given stack offset is on a new line or - not. If not this would produce an Error, otherwise this rule would be - shifted.""" - __slots__ = ['offset'] + not. If the condition is true, then the edge is followed. """ + __slots__ = ['offset', 'is_on_new_line'] + # Offset of the token which is being checked. + # - If this number is zero, then # this represent the next token. + # - If this number is -1, this represents the last shifted token. + # - If this number is -2, this represents the second to last shifted token. offset: int - def __init__(self, offset: int = 0) -> None: + # Check whether the token at the offset is (= True), or is not (= False) on + # a new line compared to the previous token. + is_on_new_line: bool + + def __init__(self, offset: int = 0, is_on_new_line: bool = False) -> None: # assert offset >= -1 and "Smaller offsets are not supported on all backends." super().__init__() + assert offset >= -1 self.offset = offset + self.is_on_new_line = is_on_new_line def is_inconsistent(self) -> bool: # We can only look at stacked terminals. 
diff --git a/jsparagus/aps.py b/jsparagus/aps.py
index 18bb7f84a..5e1a2a3e7 100644
--- a/jsparagus/aps.py
+++ b/jsparagus/aps.py
@@ -217,6 +217,7 @@ def shift_next(self, pt: ParseTable) -> typing.Iterator[APS]:
         last_edge = sh[-1]
         state = pt.states[last_edge.src]
         state_match_shift_end = self.state == self.shift[-1].src
+        term: Term
         if self.replay == []:
             assert state_match_shift_end
             for term, to in state.shifted_edges():
diff --git a/jsparagus/emit/python.py b/jsparagus/emit/python.py
index c974d371c..d3ff7d770 100644
--- a/jsparagus/emit/python.py
+++ b/jsparagus/emit/python.py
@@ -6,7 +6,7 @@
 import typing
 
 from ..grammar import ErrorSymbol, Nt, Some
-from ..actions import (Accept, Action, CheckNotOnNewLine, FilterFlag, FilterStates, FunCall,
+from ..actions import (Accept, Action, CheckLineTerminator, FilterFlag, FilterStates, FunCall,
                        Lookahead, OutputExpr, PopFlag, PushFlag, Reduce, Replay, Seq, Unwind)
 from ..runtime import ErrorToken, ErrorTokenClass
 from ..ordered import OrderedSet
@@ -73,7 +73,7 @@ def write_action(act: Action, indent: str = "") -> typing.Tuple[str, bool]:
         return indent, False
     if isinstance(act, Lookahead):
         raise ValueError("Unexpected Lookahead action")
-    if isinstance(act, CheckNotOnNewLine):
+    if isinstance(act, CheckLineTerminator):
         out.write("{}if not parser.check_not_on_new_line(lexer, {}):\n".format(indent, -act.offset))
         out.write("{}    return\n".format(indent))
         return indent, True
diff --git a/jsparagus/emit/rust.py b/jsparagus/emit/rust.py
index ca8382954..a1945e816 100644
--- a/jsparagus/emit/rust.py
+++ b/jsparagus/emit/rust.py
@@ -12,7 +12,7 @@
 from ..ordered import OrderedSet
 
 from ..grammar import (Some, Nt, InitNt, End, ErrorSymbol)
-from ..actions import (Accept, Action, Replay, Unwind, Reduce, CheckNotOnNewLine, FilterStates,
+from ..actions import (Accept, Action, Replay, Unwind, Reduce, CheckLineTerminator, FilterStates,
                        PushFlag, PopFlag, FunCall, Seq)
 from .. import types
 
@@ -223,23 +243,43 @@ def write_condition(self, state, first_act):
         # states. Thus we use the first action to produce the match statement.
         assert isinstance(first_act, Action)
         assert first_act.is_condition()
-        if isinstance(first_act, CheckNotOnNewLine):
+        if isinstance(first_act, CheckLineTerminator):
             # TODO: At the moment this is Action is implemented as a single
             # operation with a single destination. However, we should implement
             # it in the future as 2 branches, one which is verifying the lack
             # of new lines, and one which is shifting an extra error token.
             # This might help remove the overhead of backtracking in addition
             # to make this backtracking visible through APS.
-            assert len(list(state.edges())) == 1
-            act, dest = next(state.edges())
             assert len(self.replay_args) == 0
-            assert -act.offset > 0
-            self.write("// {}", str(act))
-            self.write("if !parser.check_not_on_new_line({})? {{", -act.offset)
-            with indent(self):
-                self.write("return Ok(false);")
-            self.write("}")
-            self.write_epsilon_transition(dest)
+            assert first_act.offset == -1
+            if len(state.epsilon) == 1:
+                # Generate if-not-then-error code.
+                act, dest = next(state.edges())
+                test = "!" if act.is_on_new_line else ""
+                self.write("// {}", str(act))
+                self.write("if {}parser.is_on_new_line()? {{", test)
+                with indent(self):
+                    self.write("return Ok(false);")
+                self.write("}")
+                self.write_epsilon_transition(dest)
+            else:
+                # Generate if-else code.
+                assert len(list(state.edges())) == 2
+                edges = state.edges()
+                true_act, true_dest = next(edges)
+                if not true_act.is_on_new_line:
+                    false_act, false_dest = true_act, true_dest
+                    true_act, true_dest = next(edges)
+                else:
+                    false_act, false_dest = next(edges)
+                self.write("// {} & {}", str(true_act), str(false_act))
+                self.write("if parser.is_on_new_line()? {")
+                with indent(self):
+                    self.write_epsilon_transition(true_dest)
+                self.write("} else {")
+                with indent(self):
+                    self.write_epsilon_transition(false_dest)
+                self.write("}")
         elif isinstance(first_act, FilterStates):
             if len(state.epsilon) == 1:
                 # This is an attempt to avoid huge unending compilations.
diff --git a/jsparagus/grammar.py b/jsparagus/grammar.py
index bcaf5a02a..94db558b0 100644
--- a/jsparagus/grammar.py
+++ b/jsparagus/grammar.py
@@ -1145,7 +1145,7 @@ class End:
 @dataclass(frozen=True)
 class ErrorSymbol:
     """Special grammar symbol that can be consumed to handle a syntax error."""
-    error_code: int
+    error_code: str
 
 
 Element = typing.Union[
diff --git a/jsparagus/lr0.py b/jsparagus/lr0.py
index 5e4364df3..8af81bd41 100644
--- a/jsparagus/lr0.py
+++ b/jsparagus/lr0.py
@@ -11,7 +11,7 @@
 from dataclasses import dataclass
 import typing
 
-from .actions import (Accept, Action, CheckNotOnNewLine, FunCall, Lookahead,
+from .actions import (Accept, Action, CheckLineTerminator, FunCall, Lookahead,
                       OutputExpr, Unwind, Reduce, Seq)
 from .ordered import OrderedFrozenSet
 from .grammar import (CallMethod, Element, End, ErrorSymbol, Grammar,
@@ -140,7 +140,7 @@ class LRItem:
 # A Term is the label on an edge from one state to another. It's normally a
 # terminal, nonterminal, or epsilon action. A state can also have a special
 # catchall edge, labeled with an ErrorSymbol.
-ShiftedTerm = typing.Union[str, Nt, ErrorSymbol]
+ShiftedTerm = typing.Union[str, End, Nt, ErrorSymbol]
 Term = typing.Union[ShiftedTerm, Action]
 
 
@@ -336,7 +336,7 @@ def item_transitions(
                 # Check whether the following terminal is on a new line. If
                 # not, this would produce a syntax error. The argument is the
                 # terminal offset.
-                term = CheckNotOnNewLine()
+                term = CheckLineTerminator()
             elif isinstance(term, CallMethod):
                 funcalls: typing.List[Action] = []
                 pop = sum(1 for e in prod.rhs[:lr_item.offset] if on_stack(self.grammar.grammar, e))
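Before the parse table changes, it may help to recall which JavaScript inputs the `[no LineTerminator here]` restrictions and ASI error symbols are about. A rough sketch, not jsparagus code; the expected outcomes follow the ECMAScript specification and the comments in this patch:

```python
# Illustrative JavaScript inputs and how a conforming parser treats them.
cases = [
    # (source, what a conforming parser does)
    ("return\nx", "ASI: parsed as `return; x;` (restricted production)"),
    ("do {} while (false) x", "ASI before `x` (do_while_asi), no newline needed"),
    ("a = b\n(c)", "no ASI: parsed as the call expression `b(c)`"),
]
for source, outcome in cases:
    print(repr(source), "->", outcome)
```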
diff --git a/jsparagus/parse_table.py b/jsparagus/parse_table.py
index ebf548c86..772e237f6 100644
--- a/jsparagus/parse_table.py
+++ b/jsparagus/parse_table.py
@@ -11,7 +11,8 @@
 from . import types
 from .utils import consume, keep_until, split, default_id_dict, default_fwd_dict
 from .ordered import OrderedSet, OrderedFrozenSet
-from .actions import Action, Replay, Reduce, FilterStates, Seq
+from .actions import (Action, Shift, Replay, Reduce, Unwind, FilterStates, Seq,
+                      FunCall, CheckLineTerminator)
 from .grammar import End, ErrorSymbol, InitNt, Nt
 from .rewrites import CanonicalGrammar
 from .lr0 import LR0Generator, Term
@@ -56,7 +57,7 @@ class StateAndTransitions:
     arguments: int
 
     # Outgoing edges taken when shifting terminals.
-    terminals: typing.Dict[str, StateId]
+    terminals: typing.Dict[typing.Union[str, End], StateId]
 
     # Outgoing edges taken when shifting nonterminals after reducing.
     nonterminals: typing.Dict[Nt, StateId]
@@ -153,7 +154,7 @@ def is_inconsistent(self) -> bool:
         return False
 
     def shifted_edges(self) -> typing.Iterator[
-        typing.Tuple[typing.Union[str, Nt, ErrorSymbol], StateId]
+        typing.Tuple[typing.Union[str, End, Nt, ErrorSymbol], StateId]
     ]:
         k: Term
         s: StateId
@@ -364,6 +365,13 @@ def __init__(
         self.exec_modes = grammar.grammar.exec_modes
         self.assume_inconsistent = True
         self.create_lr0_table(grammar, verbose, progress)
+        # Automatic Semicolon Insertion (ASI) is a process by which the
+        # JavaScript grammar is implicitly disambiguated. This phase makes
+        # this grammar transformation explicit.
+        self.expand_javascript_asi(verbose, progress)
+        # LR0 is inconsistent, which implies that we need a non-deterministic
+        # evaluator. This phase solves ambiguities of the grammar by adding
+        # more lookahead and context.
         self.fix_inconsistent_table(verbose, progress)
         # TODO: Optimize chains of actions into sequences.
         # Optimize by removing unused states.
@@ -587,6 +595,77 @@ def clear_edges(
         for dest in old_dest:
             self.assert_state_invariants(dest)
 
+    def restore_edges(
+            self,
+            state: StateAndTransitions,
+            shift_map: typing.DefaultDict[
+                Term,
+                typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
+            ],
+            maybe_unreachable_set: OrderedSet[StateId],
+            debug_depth: str = ""
+    ) -> None:
+        """Restore the new state machine based on a given state to use as a base and
+        the shift_map corresponding to edges."""
+
+        # print("{}starting with {}\n".format(debug_depth, state))
+        edges = {}
+        for term, actions_list in shift_map.items():
+            # print("{}term: {}, lists: {}\n".format(debug_depth, repr(term), repr(actions_list)))
+            # Collect all the states reachable after shifting the term.
+            # Compute the unique name, based on the locations and actions
+            # which are delayed.
+            locations: OrderedSet[str] = OrderedSet()
+            delayed: OrderedSet[DelayedAction] = OrderedSet()
+            new_shift_map: typing.DefaultDict[
+                Term,
+                typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
+            ]
+            new_shift_map = collections.defaultdict(lambda: [])
+            recurse = False
+            if not self.term_is_shifted(term):
+                # There is no more target after a reduce action.
+                actions_list = []
+            for target, remaining_edges in actions_list:
+                assert isinstance(target, StateAndTransitions)
+                locations |= target.locations
+                delayed |= target.delayed_actions
+                if remaining_edges != []:
+                    # Pull edges, with delayed actions.
+                    for edge in remaining_edges:
+                        edge_term = edge.term
+                        assert edge_term is not None
+                        if isinstance(edge_term, Action):
+                            delayed.add(edge_term)
+                        else:
+                            delayed.add(Shift(edge_term))
+                    edge = remaining_edges[0]
+                    edge_term = edge.term
+                    assert edge_term is not None
+                    new_shift_map[edge_term].append((target, remaining_edges[1:]))
+                    recurse = True
+                else:
+                    # Pull edges, as a copy of existing edges.
+                    for next_term, next_dest_id in target.edges():
+                        next_dest = self.states[next_dest_id]
+                        new_shift_map[next_term].append((next_dest, []))
+
+            is_new, new_target = self.new_state(
+                OrderedFrozenSet(locations), OrderedFrozenSet(delayed))
+            edges[term] = new_target.index
+            if self.debug_info:
+                print("{}is_new = {}, index = {}".format(debug_depth, is_new, new_target.index))
+                print("{}Add: {} -- {} --> {}".format(debug_depth, state.index, str(term), new_target.index))
+                print("{}continue: (is_new: {}) or (recurse: {})".format(debug_depth, is_new, recurse))
+            if is_new or recurse:
+                self.restore_edges(new_target, new_shift_map, maybe_unreachable_set, debug_depth + "  ")
+
+        self.clear_edges(state, maybe_unreachable_set)
+        for term, target_id in edges.items():
+            self.add_edge(state, term, target_id)
+        if self.debug_info:
+            print("{}replaced by {}\n".format(debug_depth, state))
+
     def assert_table_invariants(self) -> None:
         for s in self.states:
             if s is not None:
@@ -632,6 +711,28 @@ def remove_unreachable_states(
             self.remove_state(s, next_set)
             maybe_unreachable_set = next_set
 
+    def mark_sweep_states(self) -> None:
+        marked = set()
+
+        def mark(s: StateId) -> None:
+            marked.add(s)
+            for _, dest in self.states[s].edges():
+                if dest not in marked:
+                    mark(dest)
+
+        for _, idx in self.named_goals:
+            mark(idx)
+
+        maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()
+        remove_list = []
+        for s in self.states:
+            if s is not None and s.index not in marked:
+                remove_list.append(s)
+        for s in remove_list:
+            self.clear_edges(s, maybe_unreachable_set)
+        for s in remove_list:
+            self.remove_state(s.index, maybe_unreachable_set)
+
     def is_reachable_state(self, s: StateId) -> bool:
         """Check whether the current state is reachable or not."""
         if self.states[s] is None:
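The `mark_sweep_states` pass added above is a plain mark-and-sweep over the state graph: mark everything reachable from the named goals, then remove the rest. A toy illustration of the same idea, assuming a simplified edge list rather than the real `StateAndTransitions` objects:

```python
# Toy mark & sweep over an adjacency map; state 3 is unreachable.
edges = {0: [1, 2], 1: [2], 2: [], 3: [2]}
goals = [0]
marked: set = set()


def mark(s: int) -> None:
    marked.add(s)
    for dest in edges[s]:
        if dest not in marked:
            mark(dest)


for g in goals:
    mark(g)
removed = [s for s in edges if s not in marked]
assert removed == [3]
```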
@@ -915,6 +1016,139 @@ def visit(aps: APS) -> bool:
         self.aps_visitor(APS.start(state), visit)
         return record
 
+    def expand_javascript_asi(self, verbose: bool, progress: bool) -> None:
+        """The frontend of the JavaScript grammar adds ErrorSymbols to handle Automatic
+        Semicolon Insertion (ASI). As described in the ECMAScript
+        Specification [1], these locations are used as a fallback mechanism for
+        tokens which would not be allowed by the grammar otherwise.
+
+        Implementing these rules implies that we have to look at all locations
+        which have an ErrorSymbol dedicated to ASI, and add the missing
+        transitions, such that the ErrorSymbol can be converted to an ordinary
+        nonterminal of the grammar.
+
+        Note, this is an optimization dedicated to removing the handling of a
+        replay-list (variable list of lookahead) as well as a fallback
+        mechanism from the runtime of the parser.
+
+        [1] https://tc39.es/ecma262/#sec-automatic-semicolon-insertion
+
+        """
+        if verbose or progress:
+            print("Expand JavaScript Automatic Semicolon Insertion")
+
+        # Collect all states which have an ErrorSymbol.
+        todo = []
+        for s in self.states:
+            if len(s.errors) > 0:
+                todo.append(s)
+        if todo == []:
+            return
+
+        # Get the dead-end state, as the destination of the reduce edges.
+        dead_end = self.get_state(OrderedFrozenSet())
+
+        # Add states which reduce the ASI nonterminal in place of ErrorSymbol("asi").
+        asi_nt = Nt("ASI")
+        asi_term = Seq([FunCall("asi", ()), Reduce(Unwind(asi_nt, 0, 0))])
+        self.nonterminals.append(asi_nt)
+        reduce_asi: typing.Dict[ShiftedTerm, StateAndTransitions] = {}
+        for t in self.terminals:
+            reduce_asi[t] = self.get_state(
+                OrderedFrozenSet(['ASI ::= {} ·'.format(str(t)),
+                                  'ASI ::= [LineTerminator here] {} ·'.format(str(t))]))
+            self.add_edge(reduce_asi[t], asi_term.shifted_action(t), dead_end.index)
+
+        # Add CheckLineTerminator states in front of the reduce_asi states.
+        newline_asi_term = CheckLineTerminator(-1, True)
+        newline_asi: typing.Dict[ShiftedTerm, StateAndTransitions] = {}
+        for t in self.terminals:
+            newline_asi[t] = self.get_state(
+                OrderedFrozenSet(['ASI ::= [LineTerminator here] {} ·'.format(str(t))]),
+                OrderedFrozenSet([newline_asi_term]))
+            self.add_edge(newline_asi[t], newline_asi_term, reduce_asi[t].index)
+
+        maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()
+        no_LineTerminator_here = CheckLineTerminator(0, False)
+
+        def contains_no_LineTerminator_here(aps: APS) -> bool:
+            for edge in aps.history:
+                if edge.term == no_LineTerminator_here:
+                    return True
+            return False
+
+        # Replace error symbol edges of the parse table by the semantics
+        # mentioned in [1], by adding the equivalent parse table rules. The
+        # error symbols are replaced by nonterminals which are reduced by the
+        # `reduce_asi` and `newline_asi` states.
+        #
+        # [1] https://tc39.es/ecma262/#sec-automatic-semicolon-insertion
+        def visit_error_symbols() -> typing.Iterator[None]:
+            for s in todo:
+                yield  # progress bar.
+                assert len(s.errors) == 1
+                error, error_dest = next(iter(s.errors.items()))
+
+                # 11.9.1.1 Capture the list of offending tokens. An offending
+                # token is a token which is not allowed by any production of
+                # the grammar.
+                #
+                # 11.9.1.2 (End is implicitly considered as a terminal)
+                #
+                # Note: Normally offending tokens are any tokens which do not
+                # satisfy any production of the grammar, however doing so by
+                # using the list of all possible tokens causes an ambiguity in
+                # the parse table. Thus we restrict this to the list of tokens
+                # which might be accepted after the error symbol.
+                aps_lanes = self.lookahead_lanes(s.index)
+                aps_lanes_on_error = self.lookahead_lanes(error_dest)
+                assert all(len(aps.lookahead) >= 1 for aps in aps_lanes)
+                accepted_tokens = set(aps.lookahead[0] for aps in aps_lanes)
+                accepted_tokens_on_error = set(aps.lookahead[0] for aps in aps_lanes_on_error)
+                offending_tokens = [
+                    t for t in accepted_tokens_on_error
+                    if t not in accepted_tokens and t in self.terminals
+                ]
+
+                # 11.9.1.3 A restricted token is a token from a restricted
+                # production, i.e. which has a `[no LineTerminator here]`
+                # before the token.
+                restricted_lanes = [aps for aps in aps_lanes if contains_no_LineTerminator_here(aps)]
+                restricted_tokens = [aps.lookahead[0] for aps in restricted_lanes]
+
+                # Get the index of the state to go to after the inserted
+                # semicolon nonterminal.
+                error, dest = next(iter(s.errors.items()))
+                self.remove_edge(s, error, maybe_unreachable_set)
+                self.add_edge(s, asi_nt, dest)
+
+                if "}" in offending_tokens:
+                    self.add_edge(s, "}", reduce_asi["}"].index)
+                    offending_tokens.remove("}")
+
+                if error == ErrorSymbol("asi"):
+                    offending_target = newline_asi
+                elif error == ErrorSymbol("do_while_asi"):
+                    # `do{}while(false) foo()` is a valid JavaScript syntax,
+                    # where a semicolon is automatically inserted between the
+                    # closing parenthesis of the while statement and the foo
+                    # identifier.
+                    offending_target = reduce_asi
+                else:
+                    raise ValueError("Unexpected ErrorSymbol code")
+
+                for token in offending_tokens:
+                    self.add_edge(s, token, offending_target[token].index)
+                for term in restricted_tokens:
+                    if isinstance(term, (str, End)):
+                        self.add_edge(s, term, newline_asi[term].index)
+
+        consume(visit_error_symbols(), progress)
+
+        if verbose:
+            print("Expand JavaScript Automatic Semicolon Insertion result:")
+            self.debug_dump()
+
     def fix_with_context(self, s: StateId, aps_lanes: typing.List[APS]) -> None:
         raise ValueError("fix_with_context: Not Implemented")
         # # This strategy is about using context information. By using chains of
@@ -1126,6 +1360,56 @@ def fix_with_context(self, s: StateId, aps_lanes: typing.List[APS]) -> None:
     #     self.remove_unreachable_states(maybe_unreachable_set)
     #     pass
 
+    def try_fix_with_conditions(self, s: StateId) -> bool:
+        """When dealing with consistent conditions, we can simply move all the
+        inconsistencies under the condition and its negated counterpart. This
+        is equivalent to changing the order in which variables are compared,
+        given conditions which are not dependent on each other."""
+
+        state = self.states[s]
+        assert state.is_inconsistent()
+        conditions: typing.List[Action] = []
+        for act, dest in state.epsilon:
+            if act.is_condition() and not act.is_inconsistent() and act.can_negate():
+                if conditions == []:
+                    conditions.append(act)
+                elif conditions[0].check_same_variable(act):
+                    conditions.append(act)
+
+        # If we have not found any consistent condition, then fall back on
+        # other ways of solving inconsistencies.
+        if conditions == []:
+            return False
+
+        # If the extracted conditions do not all check different values, the
+        # state would remain inconsistent even after this fix.
+        pairs = itertools.combinations(conditions, 2)
+        if any(not k1.check_different_values(k2) for k1, k2 in pairs):
+            return False
+
+        # If the list of conditions does not yet cover the space of possible
+        # values, add the missing actions, with which we should wrap all the
+        # cases which have no conditionals.
+        conditions = conditions[0].negate(conditions)
+
+        shift_map: typing.DefaultDict[
+            Term,
+            typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
+        ]
+        shift_map = collections.defaultdict(lambda: [])
+        edges = list(iter(state.edges()))
+        for term, dest in edges:
+            if term in conditions:
+                shift_map[term].append((self.states[dest], []))
+            else:
+                for cond in conditions:
+                    shift_map[cond].append((self.states[dest], [Edge(s, term)]))
+
+        maybe_unreachable_set: OrderedSet[StateId] = OrderedSet()
+        self.restore_edges(state, shift_map, maybe_unreachable_set)
+        self.remove_unreachable_states(maybe_unreachable_set)
+        return True
+
     def fix_with_lookahead(self, s: StateId, aps_lanes: typing.List[APS]) -> None:
         # Find the list of terminals following each actions (even reduce
         # actions).
@@ -1181,74 +1465,8 @@ def fix_with_lookahead(self, s: StateId, aps_lanes: typing.List[APS]) -> None:
             target = self.states[target_id]
             shift_map[term].append((target, new_actions))
 
-        # Restore the new state machine based on a given state to use as a base
-        # and the shift_map corresponding to edges.
-        def restore_edges(
-                state: StateAndTransitions,
-                shift_map: typing.DefaultDict[
-                    Term,
-                    typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
-                ],
-                depth: str
-        ) -> None:
-            # print("{}starting with {}\n".format(depth, state))
-            edges = {}
-            for term, actions_list in shift_map.items():
-                # print("{}term: {}, lists: {}\n".format(depth, repr(term), repr(actions_list)))
-                # Collect all the states reachable after shifting the term.
-                # Compute the unique name, based on the locations and actions
-                # which are delayed.
-                locations: OrderedSet[str] = OrderedSet()
-                delayed: OrderedSet[DelayedAction] = OrderedSet()
-                new_shift_map: typing.DefaultDict[
-                    Term,
-                    typing.List[typing.Tuple[StateAndTransitions, typing.List[Edge]]]
-                ]
-                new_shift_map = collections.defaultdict(lambda: [])
-                recurse = False
-                if not self.term_is_shifted(term):
-                    # There is no more target after a reduce action.
-                    actions_list = []
-                for target, actions in actions_list:
-                    assert isinstance(target, StateAndTransitions)
-                    locations |= target.locations
-                    delayed |= target.delayed_actions
-                    if actions != []:
-                        # Pull edges, with delayed actions.
-                        edge = actions[0]
-                        assert isinstance(edge, Edge)
-                        for action in actions:
-                            action_term = action.term
-                            assert isinstance(action_term, Action)
-                            delayed.add(action_term)
-                        edge_term = edge.term
-                        assert edge_term is not None
-                        new_shift_map[edge_term].append((target, actions[1:]))
-                        recurse = True
-                    else:
-                        # Pull edges, as a copy of existing edges.
-                        for next_term, next_dest_id in target.edges():
-                            next_dest = self.states[next_dest_id]
-                            new_shift_map[next_term].append((next_dest, []))
-
-                is_new, new_target = self.new_state(
-                    OrderedFrozenSet(locations), OrderedFrozenSet(delayed))
-                edges[term] = new_target.index
-                if self.debug_info:
-                    print("{}is_new = {}, index = {}".format(depth, is_new, new_target.index))
-                    print("{}Add: {} -- {} --> {}".format(depth, state.index, str(term), new_target.index))
-                    print("{}continue: (is_new: {}) or (recurse: {})".format(depth, is_new, recurse))
-                if is_new or recurse:
-                    restore_edges(new_target, new_shift_map, depth + "  ")
-
-            self.clear_edges(state, maybe_unreachable_set)
-            for term, target_id in edges.items():
-                self.add_edge(state, term, target_id)
-            if self.debug_info:
-                print("{}replaced by {}\n".format(depth, state))
-
         state = self.states[s]
-        restore_edges(state, shift_map, "")
+        self.restore_edges(state, shift_map, maybe_unreachable_set)
         self.remove_unreachable_states(maybe_unreachable_set)
@@ -1273,11 +1491,24 @@ def fix_inconsistent_state(self, s: StateId, verbose: bool) -> bool:
             return False
 
         all_reduce = all(a.update_stack() for a, _ in state.epsilon)
+        any_conditional = any(a.is_condition() and a.can_negate() for a, _ in state.epsilon)
         any_shift = (len(state.terminals) + len(state.nonterminals) + len(state.errors)) > 0
+        try_with_conditionals = any_conditional
         try_with_context = all_reduce and not any_shift
         try_with_lookahead = not try_with_context
         # if verbose:
         #     print(aps_lanes_str(aps_lanes, "fix_inconsistent_state:", "\taps"))
+        if try_with_conditionals:
+            if verbose:
+                print("\tFix with conditionals")
+            fixed = self.try_fix_with_conditions(s)
+            if fixed:
+                try_with_context = False
+                try_with_lookahead = False
+            elif verbose and try_with_context:
+                print("\tFallback on fixing with context.")
+            elif verbose and try_with_lookahead:
+                print("\tFallback on fixing with lookahead.")
        if try_with_context:
             if verbose:
                 print("\tFix with context.")
@@ -1293,6 +1524,7 @@ def fix_inconsistent_state(self, s: StateId, verbose: bool) -> bool:
         aps_lanes = self.lookahead_lanes(s)
         assert aps_lanes != []
         self.fix_with_lookahead(s, aps_lanes)
+        assert not state.is_inconsistent()
         return True
@@ -1320,54 +1552,41 @@ def fix_inconsistent_table(self, verbose: bool, progress: bool) -> None:
         def visit_table() -> typing.Iterator[None]:
             nonlocal count
-            unreachable = []
             while todo:
-                while todo:
-                    yield  # progress bar.
-                    # TODO: Compare stack / queue, for the traversal of the states.
-                    s = todo.popleft()
-                    if not self.is_reachable_state(s):
-                        # NOTE: We do not fix unreachable states, as we might
-                        # not be able to compute the reduce actions. However,
-                        # we should not clean edges not backedges as the state
-                        # might become reachable later on, since states are
-                        # shared if they have the same locations.
-                        unreachable.append(s)
-                        continue
-                    assert self.states[s].is_inconsistent()
-                    start_len = len(self.states)
-                    if verbose:
-                        count = count + 1
-                        print("Fixing state {}\n".format(self.states[s].stable_str(self.states)))
-                    try:
-                        self.fix_inconsistent_state(s, verbose)
-                    except Exception as exc:
-                        self.debug_info = True
-                        raise ValueError(
-                            "Error while fixing conflict in state {}\n\n"
-                            "In the following grammar productions:\n{}"
-                            .format(self.states[s].stable_str(self.states),
-                                    self.debug_context(s, "\n", "\t"))
-                        ) from exc
-                    new_inconsistent_states = [
-                        s.index for s in self.states[start_len:]
-                        if s.is_inconsistent()
-                    ]
-                    if verbose:
-                        print("\tAdding {} states".format(len(self.states[start_len:])))
-                        print("\tWith {} inconsistent states".format(len(new_inconsistent_states)))
-                    todo.extend(new_inconsistent_states)
-
-                # Check whether none of the previously inconsistent and
-                # unreahable state became reachable. If so add it back to the
-                # todo list.
-                still_unreachable = []
-                for s in unreachable:
-                    if self.is_reachable_state(s):
-                        todo.append(s)
-                    else:
-                        still_unreachable.append(s)
-                unreachable = still_unreachable
+                yield  # progress bar.
+                # TODO: Compare stack / queue, for the traversal of the states.
+                s = todo.popleft()
+                if self.states[s] is None:
+                    continue
+                assert self.states[s].is_inconsistent()
+                start_len = len(self.states)
+                if verbose:
+                    count = count + 1
+                    print("Fixing state {}\n".format(self.states[s].stable_str(self.states)))
+                try:
+                    self.fix_inconsistent_state(s, verbose)
+                except Exception as exc:
+                    self.debug_info = True
+                    raise ValueError(
+                        "Error while fixing conflict in state {}\n\n"
+                        "In the following grammar productions:\n{}"
+                        .format(self.states[s].stable_str(self.states),
+                                self.debug_context(s, "\n", "\t"))
+                    ) from exc
+                new_inconsistent_states = [
+                    s.index for s in self.states[start_len:]
+                    if s.is_inconsistent()
+                ]
+                if verbose:
+                    print("\tAdding {} states".format(len(self.states[start_len:])))
+                    print("\tWith {} inconsistent states".format(len(new_inconsistent_states)))
+
+                todo.extend(new_inconsistent_states)
+                self.mark_sweep_states()
+                if not todo:
+                    for state in self.states:
+                        if state is not None and state.is_inconsistent():
+                            todo.append(state.index)
 
         consume(visit_table(), progress)
         if verbose:
@@ -1382,6 +1601,7 @@ def visit_table() -> typing.Iterator[None]:
         if verbose:
             print("Fix Inconsistent Table Result:")
             self.debug_dump()
+        self.debug_info = False
 
     def remove_all_unreachable_state(self, verbose: bool, progress: bool) -> None:
         self.states = [s for s in self.states if s is not None]
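To summarize the effect of `expand_javascript_asi` on a single state, here is a toy before/after picture. The state numbers and the `reduce_asi`/`newline_asi` labels are illustrative only, not the real `ParseTable` representation:

```python
# Hypothetical before/after view of one state; all numbers are made up.
state_before = {
    ";": 12,                    # ordinary shift of an explicit semicolon
    "ErrorSymbol(asi)": 30,     # runtime fallback edge, removed by the pass
}

# Tokens only accepted after the error symbol (11.9.1.1 "offending tokens").
offending_tokens = ["}", "var"]

state_after = {
    ";": 12,
    "Nt(ASI)": 30,                # error symbol replaced by the ASI nonterminal
    "}": "reduce_asi['}']",       # `}` allows ASI unconditionally (11.9.1)
    "var": "newline_asi['var']",  # other tokens must follow a LineTerminator
}

assert "ErrorSymbol(asi)" not in state_after
```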