diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..b25c15b8 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ diff --git a/README.md b/README.md index 33253aab..fcbb5545 100644 --- a/README.md +++ b/README.md @@ -62,40 +62,45 @@ example, it can turn CSS code like

This is an ongoing project to provide software that lets you treat any parsable format as if it were XML, without the need for markup.

-

There are currently five papers:

+

Resources

+ + +

There are five papers on the development:

-

Software to support ixml will be made available at a later date, at https://github.com/invisibleXML/ixml

- -

The draft Specification for Invisible -XML is available.

+

Software to support ixml will be made available at a later date, but it + is already usable via a link; see the tutorial for details.

diff --git a/ixml-specification.html b/ixml-specification.html index b2cb11de..37e69caf 100644 --- a/ixml-specification.html +++ b/ixml-specification.html @@ -1,27 +1,25 @@ - + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns="http://www.w3.org/1999/xhtml"> Invisible XML Specification - + body {margin-left: auto; margin-right:auto; max-width: 50em;} + h1, h2 {font-family: sans-serif; clear: both} + pre {margin-left: 2em; padding: 0.5em 0 0.5em 1em; background-color: #ddf} + code {color: #A00} + div {border: thin red solid} + span {background: yellow} + img {float: right; width: 25%; margin-left: 1em; border: thin black solid} + @@ -29,7 +27,7 @@

Invisible XML Specification (Draft)

Editor: Steven Pemberton, CWI, Amsterdam

-

Version: 2021-11-23

+

Version: 2022-02-22

Status

@@ -37,7 +35,8 @@

Status

@@ -160,7 +160,7 @@

Introduction

</url>

or

-
<url scheme='http'>://
+
<url scheme='http'>
    <host>www.w3.org</host>
    <path>/TR/1999/xhtml.html</path>
 </url>
@@ -187,7 +187,7 @@

How it works

in the grammar affect details of this serialisation, excluding parts of the tree, or serialising parts as attributes instead of elements.

-

As an example, consider this simplified grammar for URLs:d

+

As an example, consider this simplified grammar for URLs:

url: scheme, ":", authority, path.
 
 scheme: letter+.
@@ -268,7 +268,7 @@ 

How it works

url: scheme, -":", authority, path.

and

-
authority: "//", host.authority: "//", host.s;
+
authority: "//", host.

to

authority: -"//", host.
@@ -388,21 +388,21 @@

Terminals

encoded.

A quoted string is an optionally marked string of one or more characters, -enclosed with single or double quotes. The enclosing quote is represented in a -string by doubling it. A quoted string matches only the exact same string in -the input. Examples: "yes" 'yes'.

- -

These two strings are identical: 'Isn''t it?' "Isn't it?"

-
 -quoted: (tmark, s)?, -string.
+enclosed with single or double quotes. A quoted string matches only the exact
+same string in the input. Examples: "yes" 'yes'. 

+ +

A string may not extend over a line-break. The enclosing quote is +represented in a string by doubling it; these two strings are identical: +'Isn''t it?' "Isn't it?", as are these: "He said ""Don't!""" +'He said "Don''t!"'.

+
 -quoted: (tmark, s)?, string.
   @tmark: ["^-"].
-  string: -'"', dstring, -'"', s;
-          -"'", sstring, -"'", s.
-@dstring: dchar+.
-@sstring: schar+.
-   dchar: ~['"'];
-          '"', -'"'. {all characters, quotes are doubled}
-   schar: ~["'"];
-          "'", -"'". {all characters, quotes are doubled}
+ @string: -'"', dchar+, -'"', s; + -"'", schar+, -"'", s. + dchar: ~['"'; #a; #d]; + '"', -'"'. {all characters except line breaks; quotes must be doubled} + schar: ~["'"; #a; #d]; + "'", -"'". {all characters except line breaks; quotes must be doubled}

An encoded character is an optionally marked hexadecimal number. It represents a single character and matches that character in the input. It @@ -445,24 +445,24 @@

Character sets

A range matches any character in the range from the start character to the end, inclusive, using the Unicode ordering:

-
range: from, -"-", s, to.
+
range: from, s, -"-", s, to, s.
 @from: character.
   @to: character.

A character is a string of length one, or a hex encoded character:

-
-character: -'"', dchar, -'"', s;
-            -"'", schar, -"'", s;
-            "#", hex, s.
+
-character: -'"', dchar, -'"';
+            -"'", schar, -"'";
+            "#", hex.

A class is two letters, representing any character from the Unicode character category [Categories] of that name. E.g. [Ll] matches any lower-case letter, [Ll; Lu] matches any upper- or lower-case character; it is an error if there is no such class.

-
  class: code, s.
+
   class: code, s.
    @code: capital, letter?.
 -capital: ["A"-"Z"].
--letter: ["a"-"z"].
+ -letter: ["a"-"z"].

Complete

         ixml: s, rule+s, s.
@@ -496,21 +496,19 @@ 

Complete

charset. literal: quoted; encoded. - -quoted: (tmark, s)?, -string. + -quoted: (tmark, s)?, string. @name: namestart, namefollower*. -namestart: ["_"; L]. -namefollower: namestart; ["-.·‿⁀"; Nd; Mn]. @tmark: ["^-"]. - string: -'"', dstring, -'"', s; - -"'", sstring, -"'", s. - @dstring: dchar+. - @sstring: schar+. - dchar: ~['"']; - '"', -'"'. {all characters, quotes must be doubled} - schar: ~["'"]; - "'", -"'". {all characters, quotes must be doubled} + @string: -'"', dchar+, -'"', s; + -"'", schar+, -"'", s. + dchar: ~['"'; #a; #d]; + '"', -'"'. {all characters except line breaks; quotes must be doubled} + schar: ~["'"; #a; #d]; + "'", -"'". {all characters except line breaks; quotes must be doubled} -encoded: (tmark, s)?, -"#", @hex, s. hex: ["0"-"9"; "a"-"f"; "A"-"F"]+. @@ -522,12 +520,12 @@

Complete

-member: literal; range; class. - range: from, -"-", s, to. + range: from, s, -"-", s, to, s. @from: character. @to: character. - -character: -'"', dchar, -'"', s; - -"'", schar, -"'", s; - "#", hex, s. + -character: -'"', dchar, -'"'; + -"'", schar, -"'"; + "#", hex. class: code, s. @code: capital, letter?. -capital: ["A"-"Z"]. @@ -536,18 +534,18 @@

Complete

Parsing

The root symbol of the grammar is the name of the first rule in the grammar. -If it is marked as hidden, all of its productions must produce exactly one -non-hidden nonterminal and no non-hidden terminals before or after that -nonterminal (in order to match the XML requirement of a single-rooted -document).

+It may not be marked as an attribute. If it is marked as hidden, all of its +productions must produce exactly one non-hidden non-attribute nonterminal and +no non-hidden terminals before or after that nonterminal (in order to match the +XML requirement of a single-rooted document).

Processors must accept and parse any conforming grammar, and produce at -least one parse of any input that conforms to the grammar starting at the root +least one parse of supplied input that matches the grammar starting at the root symbol. If more than one parse results, one is chosen; it is not defined how -this choice is made, but the resulting parse must be marked as ambiguous by +this choice is made, but the resulting parse should be marked as ambiguous by including the attribute ixml:state="ambiguous" on the document element of the serialisation. The ixml namespace URI is -"http://invisiblexml.org/NS".

+"http://invisiblexml.org/NS".

Serialisation

@@ -602,9 +600,9 @@

Serialisation

<right>b</right> </expr>
-

Hints for Implementors

+

Hints for Implementers

-

Many parsing algorithms only mention terminals, and nonterminals, and don't +

Many parsing algorithms only mention terminals and nonterminals, and don't explain how to deal with the repetition constructs used in ixml. However, these can be handled simply by converting them to equivalent simple constructs. In the examples below, f and sep are @@ -613,27 +611,23 @@

Hints for Implementors

Optional factor:

f? ⇒ f-option
--f-option: f; .
+-f-option: f; ().

Zero or more repetitions:

f* ⇒ f-star
--f-star: f, f-star; .
+-f-star: (f, f-star)?.

One or more repetitions:

f+ ⇒ f-plus
--f-plus: f, f-star.
--f-star: f, f-star; .
+-f-plus: f, f*.

One or more repetitions with separator:

f+sep ⇒ f-plus-sep
--f-plus-sep: f, sep-part-option. 
--sep-part-option: sep, f-plus-sep; .
+-f-plus-sep: f, (sep, f)*.

Zero or more repetitions with separator:

f*sep ⇒ f-star-sep
--f-star-sep: f-plus-sep; .
--f-plus-sep: f, sep-part-option.
--sep-part-option: sep, f-plus-sep; .
+-f-star-sep: (f+sep)?.

IXML in IXML

@@ -703,7 +697,7 @@

IXML in IXML

</rule> <rule name='comment'> <alt> - <literal tmark='-' dstring='{'/> + <literal tmark='-' string='{'/> <repeat0> <alts> <alt> @@ -714,13 +708,13 @@

IXML in IXML

</alt> </alts> </repeat0> - <literal tmark='-' dstring='}'/> + <literal tmark='-' string='}'/> </alt> </rule> <rule mark='-' name='cchar'> <alt> <exclusion> - <literal dstring='{}'/> + <literal string='{}'/> </exclusion> </alt> </rule> @@ -737,29 +731,27 @@

IXML in IXML

<nonterminal name='name'/> <nonterminal name='s'/> <inclusion tmark='-'> - <literal dstring='=:'/> + <literal string='=:'/> </inclusion> <nonterminal name='s'/> <nonterminal mark='-' name='alts'/> - <literal tmark='-' dstring='.'/> + <literal tmark='-' string='.'/> </alt> - </rule + </rule>

Conformance

-

In this specification, the verb "must" expresses unconditional requirements -for conformance to the specification; the verb "should" expresses requirements -that are encouraged but which are not conditions of conformance; the verb "may" -expresses optional features which are neither required nor prohibited.

+

In this specification, the verb "must" expresses unconditional +requirements for conformance to the specification; the verb "should" expresses +requirements that are encouraged but which are not conditions of conformance; +the verb "may" expresses optional features which are neither required nor +prohibited.

Conformance to this specification can meaningfully be claimed for grammars -and for processors.

- -

Note: although input described by a grammar is sometimes described as -"obeying" or "conforming to" the grammar, conformance to this specification -cannot be claimed of input streams or of input + grammar pairs.

+and for processors; it cannot be claimed for input streams or input + grammar +pairs.

-

Conformance of grammars

+

Conformance of grammars

An ixml grammar in ixml form conforms to this specification if

Note: The normative formulations of conformance requirements are those given -elsewhere in this specification. But for convenience the requirements that go +elsewhere in this specification. For convenience the requirements that go beyond what is expressed in the grammar itself can be summarized as follows. -Reasonable effort has been used to make this list complete, but omission of any -conformance requirement from this list does not affect its status as a -conformance requirement.

+(Reasonable effort has been used to make this list complete, but omission of +any conformance requirement from this list does not affect its status as a +conformance requirement.)

-

Conformance of processors

+

Conformance of processors

-

A processor conforms to this specification if it accepts grammars in ixml -{or XML?} form and uses those grammars to parse input and produce -XML documents representing serialized trees as specified elsewhere in this -specification. A conforming processor must not accept non-conforming -grammars.

- -

In addition to requirements mentioned elsewhere in this specification, the -following also apply to conforming processors:

+

A conforming processor must accept grammars in ixml form, and should accept +grammars in XML form. A conforming processor must not accept non-conforming +grammars. For any conforming grammar and any input:

+

Note: the requirements require that grammars be processed by an algorithm +that accepts and parses any context-free grammar; known algorithms of this +class include [Earley], [Unger], [CYK], [GLR], and [GLL]; +see also [Grune].

+

References

-

[Unicode] The Unicode Consortium (ed.), The Unicode Standard -— Version 13.0. Unicode Consortium, 2020, ISBN 978-1-936213-26-9, [Unicode] The Unicode Consortium (ed.), The Unicode +Standard — Version 13.0. Unicode Consortium, 2020, ISBN +978-1-936213-26-9, http://www.unicode.org/versions/Unicode13.0.0/

+

[Properties] ibid. Chapter 4, Unicode Character Propertieshttps://www.unicode.org/versions/Unicode13.0.0/ch04.pdf

--> -

[Categories] General Category Values [Categories] The Unicode Consortium (ed.), Unicode +Standard Annex #44: Unicode Character Database -- General Category Values +https://unicode.org/reports/tr44/#General_Category_Values (See also http://www.fileformat.info/info/unicode/category/index.htm)

-

[XML] Tim Bray et al. (eds.), Extensible Markup Language (XML) 1.0 -(Fifth Edition), W3C, 2008, [XML] Tim Bray et al. (eds.), Extensible Markup +Language (XML) 1.0 (Fifth Edition), W3C, 2008, https://www.w3.org/TR/xml/

-

Informational References

- -

[Earley] Earley, Jay (1970), "An efficient context-free parsing -algorithm", Communications of the ACM, 13 (2): 94–102, doi:10.1145/362007.362035

+

Informational References

-

[Unger] Unger, S. H. A global parser for context-free phrase -structure grammars. Commun. ACM, 11(4):240–247, April 1968

+

[CYK] Sakai, Itiroo. Syntax in universal translation. In +1961 International Conference on Machine Translation of Languages and Applied +Language Analysis, pages 593–608. https://aclanthology.org/www.mt-archive.info/50/NPL-1961-Sakai.pdf

-

[CYK] Sakai, Itiroo. Syntax in universal translation. In 1961 -International Conference on Machine Translation of Languages and Applied -Language Analysis, pages 593–608

+

[Earley] Earley, J. An efficient context-free parsing +algorithm. Communications of the ACM, 13(2):94–102, February 1970, doi:10.1145/362007.362035

-

[GLR] Masaru Tomita. Generalized LR Parsing. Springer Science & -Business Media. ISBN 978-1-4615-4034-2.

+

[GLL] Elizabeth Scott and Adrian Johnstone, GLL Parsing. +Electronic Notes in Theoretical Computer Science, Volume 253, Issue 7, 17 +September 2010, pages 177-189. doi:10.1016/j.entcs.2010.08.041

-

[GLL] GLL Parsing Elizabeth Scott and Adrian Johnstone Electronic -Notes in Theoretical Computer Science, Volume 253, Issue 7, 17 September 2010, -Pages 177-189

+

[GLR] Masaru Tomita. Generalized LR Parsing. Springer +Science & Business Media. ISBN 978-1-4615-4034-2. doi:10.1007/978-1-4615-4034-2

-

[Grune] Grune, D. and Jacobs, C. (2008). Parsing techniques : a -practical guide (2nd ed.). New York: Springer. p. 579. ISBN 978-0-387-20248-8. -[Grune] Grune, D. and Jacobs, C. Parsing techniques : a +practical guide (2nd ed.). New York: Springer, 2008. ISBN +978-0-387-20248-8. https://dickgrune.com/Books/PTAPG_2nd_Edition/CompleteList.pdf

-

Acknowledgments

+

[Unger] Unger, S. H. A global parser for context-free phrase +structure grammars. Communications of the ACM, 11(4):240–247, April +1968, doi:10.1145/362991.363001

+ +

Acknowledgements

This specification was produced by the W3C ixml community group, that at the time of publishing consisted of the members: {list of names}

diff --git a/ixml.ixml b/ixml.ixml index 2751c401..57680380 100644 --- a/ixml.ixml +++ b/ixml.ixml @@ -1,4 +1,7 @@ - ixml: s, rule+s, s. + ixml: prolog, rule+s, s. + + prolog: s, ppragma+s, s. + -ppragma: pragma, s, -".". -s: (whitespace; comment)*. -whitespace: -[Zs]; tab; lf; cr. @@ -8,7 +11,9 @@ comment: -"{", (cchar; comment)*, -"}". -cchar: ~["{}"]. - rule: (mark, s)?, name, s, -["=:"], s, -alts, -".". + rule: annotation, name, s, -["=:"], s, -alts, -".". + -annotation: (pragma, sp?), (mark, sp)?. + -sp: (s; pragma)*. @mark: ["@^-"]. alts: alt+(-[";|"], s). alt: term*(-",", s). @@ -23,34 +28,32 @@ repeat1: factor, -"+", s, sep?. option: factor, -"?", s. sep: factor. - nonterminal: (mark, s)?, name, s. + nonterminal: annotation, name, s. -terminal: literal; charset. literal: quoted; encoded. - -quoted: (tmark, s)?, -string. + -quoted: tannotation, -string. @name: namestart, namefollower*. -namestart: ["_"; L]. -namefollower: namestart; ["-.·‿⁀"; Nd; Mn]. @tmark: ["^-"]. - string: -'"', dstring, -'"', s; - -"'", sstring, -"'", s. - @dstring: dchar+. - @sstring: schar+. - dchar: ~['"']; - '"', -'"'. {all characters, quotes must be doubled} - schar: ~["'"]; - "'", -"'". {all characters, quotes must be doubled} - -encoded: (tmark, s)?, -"#", @hex, s. + @string: -'"', dchar+, -'"', s; + -"'", schar+, -"'", s. + dchar: ~['"'; #a; #d]; + '"', -'"'. {all characters except line breaks; quotes must be doubled} + schar: ~["'"; #a; #d]; + "'", -"'". {all characters except line breaks; quotes must be doubled} + -encoded: tannotation, -"#", @hex, s. hex: ["0"-"9"; "a"-"f"; "A"-"F"]+. -charset: inclusion; exclusion. - inclusion: (tmark, s)?, set. - exclusion: (tmark, s)?, -"~", s, set. + inclusion: tannotation, set. + exclusion: tannotation, -"~", s, set. -set: -"[", s, member*(-[";|"], s), -"]", s. -member: literal; range; @@ -65,3 +68,15 @@ @code: capital, letter?. -capital: ["A"-"Z"]. -letter: ["a"-"z"]. + + pragma: -"[", @pmark?, @pname, (s, pragma-data)?, -"]". 
+ @pname: -QName; -UQName. + @pmark: ["@^?"]. + pragma-data: (-pragma-chars; -bracket-pair)*. +-pragma-chars: ~["[]"]*. +-bracket-pair: '[', -pragma-data, ']'. + + -QName: -name, ':', -name. + -UQName: 'Q{', -ns-name, '}', -name. + -ns-name: ~["{}"; '"'; "'"]* + diff --git a/ixml.xml b/ixml.xml index 9a8a0fd7..933100d6 100644 --- a/ixml.xml +++ b/ixml.xml @@ -1,3 +1,4 @@ + @@ -58,7 +59,7 @@ - + @@ -69,13 +70,13 @@ - + - + @@ -92,17 +93,17 @@ - + - + - + @@ -114,7 +115,7 @@ - + @@ -130,7 +131,7 @@ - + @@ -160,17 +161,17 @@ - + - + - + - + @@ -253,7 +254,7 @@ - + @@ -264,7 +265,7 @@ - + @@ -273,62 +274,56 @@ - + - - - - - - - - - - - - - - - + + + + - - + + + - + + + - - + + - all characters, quotes must be doubled + all characters except line breaks; quotes must be doubled - + + + - - + + - all characters, quotes must be doubled + all characters except line breaks; quotes must be doubled - + @@ -386,14 +381,14 @@ - + - + @@ -401,14 +396,14 @@ - + - + @@ -426,9 +421,11 @@ - + + + @@ -443,21 +440,18 @@ - + - - + - + - - + - + - diff --git a/misc/pragmas-proposal.md b/misc/pragmas-proposal.md new file mode 100644 index 00000000..526b22ba --- /dev/null +++ b/misc/pragmas-proposal.md @@ -0,0 +1,430 @@ +# A namespaces + pragmas proposal for ixml + +Tom Hillman, Michael Sperberg-McQueen + +15 December 2021, rev. 4 January 2022 + +This document describes a proposal for adding namespace declarations +and pragmas to ixml. + +For background, use cases, and some discussion of design choices, see +the document [pragmas.md](pragmas.md) in this directory. In that +document, the proposal made here is referred to as namespaces proposal +S and pragmas proposal F. + +We believe that the examples given in [pragmas.md](pragmas.md) +demonstrate that the proposal outlined here satisfies the known use +cases for pragmas. 
+ +## Pragmas + +Pragmas are a syntactic device to allow grammar writers to communicate +with processors in non-standard ways without interfering with the operation of other +processors. To avoid interference with other processors, two +requirements arise: + +* Pragmas must be syntactically identifiable as such. + +* Also, it must be possible for processors to distinguish pragmas +directed at them from other pragmas. This proposal uses namespaces, +QNames, and URI-qualified names to allow grammar writers and +implementations to avoid collisions. + +Pragmas may affect the behavior of a processor in any way, either in +ways that leave the meaning of a grammar unchanged or in ways that +change the meaning of the grammar in which the pragmas appear. + +Since specific pragmas will be understood by some processors and not +others, it is necessary in this proposal to discuss both how an +example might be processed by a processor which understands and +implements the pragma and how it will be processed by a processor +oblivious to it. (See below for the terms *understand* and +*oblivious*.) + + +### Syntax in ixml + +In ixml, pragmas are enclosed in square brackets, which contain an +optional mark, a qualified name in some form, and optionally +additional data, which takes the form of a sequence of +square-bracket-balanced characters. The relevant part of the ixml +grammar is: + +```` +pragma: -"[", @pmark?, @pname, (whitespace, pragma-data)?, -"]". +@pname: -QName; -UQName. +@pmark: ["@^?"]. +pragma-data: (-pragma-chars; -bracket-pair)*. +-pragma-chars: ~["[]"]+. +-bracket-pair: '[', -pragma-data, ']'. + +-QName: -name, ':', -name. +-UQName: 'Q{', -ns-name, '}', -name. +-ns-name: ~["{}"; '"'; "'"]*. +```` + +For example, the following are all syntactically well formed pragmas: + +* `[?my:blue]` +* `[?Q{http://example.org/NS/mine}blue]` +* `[@my:color blue]` + +It is a consequence of the syntax that pragmas can contain nested +pairs of square brackets. 
+ +```` +[ls:rewrite + comment: -"{", (cchars; comment)*, -"}". + [ls:token] -cchars: cchar+. +] +```` + +Here, in fact, the pragma contains a nested pragma, though the nesting +is only apparent to a processor which understands the *ls:rewrite* +pragma and knows to parse its pragma data as a sequence of rules in +ixml notation. A processor which does *not* understand the +*ls:rewrite* pragma will merely know that the pragma data here +contains 100 characters, which happen to include one nested pair of +brackets. That suffices. + +Pragmas may appear: + +* immediately before a terminal or nonterminal symbol in the +right-hand side of a rule, before or after its mark if any, or + +* immediately before the nonterminal symbol on the left-hand side of a +rule, before or after its mark if any, or + +* after the final alternative of a rule, before the full stop ending +the rule, or + +* before the first rule of the grammar. + +In the final case, each pragma must be followed by a full stop. + +Each of these requires some changes to the grammar of ixml. To allow +pragmas immediately before symbols, we change the grammatical +definitions of symbols, both nonterminals: + +```` +nonterminal: annotation, name, s. +-annotation: (pragma, sp)?, (mark, sp)?. +-sp: (s; pragma)*. +```` + +and terminals: + +```` +-quoted: tannotation, -string. +-encoded: tannotation, -"#", @hex, s. +inclusion: tannotation, set. +exclusion: tannotation, -"~", s, set. +-tannotation: (pragma, sp)?, (tmark, sp)?. +```` + +To allow pragmas on the left-hand side of a rule and before its +closing full stop, we modify the definition of *rule*: + +```` +rule: annotation, name, s, -["=:"], s, -alts, (pragma, sp)?, -".". +```` + +To allow pragmas before the first rule and to distinguish them from +pragmas occurring on the left-hand side of the first rule, we modify +the definition of *ixml* to introduce a *prolog*. + +```` +ixml: prolog, rule+s, s. +prolog: s, ppragma+s, s. +-ppragma: pragma, s, -'.'. 
+```` + + +### Syntax in XML + +Following the normal rules of ixml, pragmas are serialized as elements +named `pragma`, with attributes named `pmark` and `pname` and an +optional child element named `pragma-data`. In addition, in XML +grammars `pragma` elements may contain any number of XML elements +following the `pragma-data` element. + +For example: + +```` + +```` + +or + +```` + +```` + +or + +```` + + blue + +```` + +or + +```` + + comment: -"{", (cchars; comment)*, -"}". + [ls:token] -cchars: cchar+. + + +```` + +Pragma-oblivious processors and processors which do not implement the +pragma in question will as a matter of course produce `pragma` +elements with just the one child element (or none). But processors +which implement a given pragma are free to inject additional XML +elements into the XML form of the pragma. It is to be assumed that the +XML elements contain no additional information, only a mechanically +derived XML form which makes the information in the pragma easier to +process. It is to be expected that any software to serialize XML +grammars in ixml form will discard the additional XML elements. + +For example, note that a processor which understands the *ls:rewrite* +pragma might prefer to produce a different XML representation for it, +e.g. one in which the embedded grammar rules have their normal XML +representation. As noted above: pragmas may affect the behavior of a +processor in any way. For such a processor, the XML representation +might be: + +```` + + comment: -"{", (cchars; comment)*, -"}". + [ls:token] -cchars: cchar+. + + + + + + + + + + + + + + + + + + + + +```` + +Note that because the additional XML elements within the pragma are +just redundant XML representations of the pragma data, a +pragma-oblivious application to rewrite XML grammars in ixml form will +lose no information when transcribing this XML pragma as + +```` +[ls:rewrite + comment: -"{", (cchars; comment)*, -"}". + [ls:token] -cchars: cchar+. 
+] +```` + +It should be noted that the *pmark* allowed by the syntax has no +effect on the XML representation produced by the core rules of ixml. +Pragma-oblivious processors will always produce XML representation of +pragmas of the form described here. Pragma-aware processors may +implement pragmas which modify the standard XML representation +('pragmas for pragmas'). See [pragmas.md](pragmas.md) for an +example. + + +### Pragma scope + +In this proposal, pragmas always apply explicitly to some part of a +grammar: + +* to a symbol occurrence in the right-hand side of a rule, or + +* to a rule + +* to the grammar as a whole + +The relation between a pragma and the part of the grammar to which it +applies is reflected in the XML form of a grammar: pragmas appear as +child elements of the part of the grammar they apply to (an element +named `ixml`, `rule`, `nonterminal`, `literal`, `inclusion`, or +`exclusion`). + +These associations between pragmas and parts of grammars are specified +here for clarity and to enable clearer discussion of pragmas, but they +have no effect on the operational semantics of ixml processors. A +pragma-oblivious processor will not be affected by the pragmas, +regardless of what they apply to, and a processor that understands a +given pragma will know from its definition what changes in behavior it +requests. The associations given above are thus of most direct use to +those specifying the meaning of specific pragmas. + + +### Operational semantics + +In describing the operational semantics of pragmas, we distinguish +different classes of ixml processor: + +* *pragma-oblivious* processors recognize pragmas syntactically but +otherwise ignore them all, except for namespace +declarations. Informally, they do not 'understand' any pragmas, and +their only obligation is not to trip over pragmas when they encounter +them. + +* *pragma-aware* processors recognize pragmas syntactically and modify +their behavior in accordance with some pragmas. 
Informally, they +'understand' some pragmas but not all. For each pragma they recognize, +they must determine whether it is one they 'understand' and implement, +or not. + +With regard to a given pragma, processors either *implement* that +pragma or they do not. A processor *implements* a pragma if and only if it +adjusts its behavior as specified by that pragma. In the ideal case +there will be some written specification of the pragma which describes +the operational effect of the pragma clearly. This proposal assumes +that a processor can use the qualified name of a pragma to determine +whether the processor implements the pragma or not and thus decide +whether to modify its normal operation or not. + +The obligation of pragma-oblivious processors is to accept pragmas +when they occur in the ixml form of a grammar, and (if they are +producing an XML form of the grammar) to produce the correct XML form of +each pragma, just as they produce the corresponding XML form for any +construct in the grammar. + +Pragma-aware processors MUST similarly accept pragmas when they +occur in the ixml form of a grammar, and (if they are producing an XML +form of the grammar) produce the correct XML form of each pragma, just +as they produce the corresponding XML form for any construct in the +grammar. As already noted, however, pragmas may modify this behavior +like any other. + +### Conformance requirements for pragmas + +Processors MUST be capable, at user option, of ignoring all pragmas +other than namespace declarations and processing a grammar using the +standard rules of ixml. + +Processors which accept ixml grammars MUST accept pragmas in the ixml +form of a grammar, whether they understand or implement the specific +pragmas or not. + +Processors which accept XML grammars MUST accept pragmas in the XML +form of a grammar, whether they understand or implement the specific +pragmas or not. 
+ +If a pragma which the processor does not understand or implement is +present in a grammar used to parse input, the processor MUST process +the grammar in the same way as if the pragma were not present. + +When ixml grammars are processed as input using the grammar for ixml, +processors MUST produce the correct XML form of each pragma, just as +they produce the corresponding XML form for any construct in the +grammar, *except* as the processor's behavior is affected by the +presence of pragmas in the grammar for ixml used to parse the input. + + +## Namespace declarations + +Namespace declarations take the form of a pragma appearing in the +prolog of a grammar and using the reserved prefix *ixmlns* in their +QName. Their pragma data is interpreted as an IRI. + +For example the following namespace declarations bind the prefix +"`xsd`" to the namespace for the XSD schema definition language, and +"`rng`" to that for Relax NG: + +```` +[ixmlns:xsd http://www.w3.org/2001/XMLSchema] +[ixmlns:rng http://relaxng.org/ns/structure/1.0] +```` + +As is the case for XML namespaces generally, the pragma data +SHOULD be a legal URI, but ixml processors are not obligated to check +the URI for syntactic correctness (although they may do so), and +normally SHOULD NOT attempt to dereference it. + +The effect of a namespace declaration is to bind the local part of the +QName to the given namespace and allow it to be used as a prefix in +QNames to denote qualified names in the given namespace. + +The following rules apply: + +* The prefix *ixmlns* is understood by all conforming ixml software as + bound to the namespace-binding namespace + "`http://example.com/ixml-namespaces`". + +* A pragma with the unprefixed *ixmlns* is interpreted as defining a + default namespace. + +* All namespace declarations pertain to the grammar as a whole and + MUST be given before the first rule of the grammar. + +* No two namespace declarations may bind the same prefix. 
+ +* A nonterminal taking the lexical form of a QName MUST, if serialized, + be serialized as an XML element name with the same local name and + with a prefix bound to the same namespace. Normally the prefix + SHOULD be as given in the grammar. *(If all namespaces are declared + before the first rule, there should be no reason it should be + impossible to use the same prefix. Perhaps we can make this a + 'MUST'.)* + + The ixml processor is responsible for including appropriate + namespace declarations in the XML output. + +* In the XML form of an ixml grammar, all namespaces bound in the + ixml grammar SHOULD be bound in the XML form of the grammar. + + This should normally take the form of namespace declarations on the + `ixml` element. + + +## Appendix: Possible variations + +Some obvious variations on this proposal can be listed. + +* Instead of saying that pragma-oblivious processors must ignore all + pragmas, we could say that they must ignore all pragmas + except namespace declarations. + +* Instead of saying processors MUST be able to ignore all pragmas, we + might say they SHOULD be able to ignore all pragmas. + +* Instead of forbidding two namespace declarations for the same + prefix, we could say they MUST agree, or that the first one wins (as + for entity declarations) or that the last one wins (as for multiple + `let` clauses for the same variable in a FLWOR expression in + XQuery). + +* Instead of requiring that in the XML form of an ixml grammar all + namespaces bound in the ixml grammar SHOULD be bound in the XML + form of the grammar, we could say that those actually used MUST be + bound, or that they all MUST be bound. + +* Instead of using the prefix *ixmlns*, namespace declarations could + use *xmlns*. Strictly speaking, however, that name is reserved and + it's not clear we have the standing to use it here. + +* Instead of using pragma syntax, namespace declarations could use + some other syntax. E.g. 
+ + prolog: s, (ppragma; namespace)+s, s. + namespace: -'declare', s, -'namespace', s, + @prefix, s, -'=', s, @ns-name, s, '.'. + @prefix: name. + @ns-name: string. + + In this case, the examples shown above would take the form + + declare namespace xsd = "http://www.w3.org/2001/XMLSchema". + declare namespace rng = "http://relaxng.org/ns/structure/1.0". diff --git a/misc/pragmas.md b/misc/pragmas.md new file mode 100644 index 00000000..0937d525 --- /dev/null +++ b/misc/pragmas.md @@ -0,0 +1,1576 @@ +# Pragmas for ixml + +2021-11-16, rev. most recently 2021-12-04 + +**Note, 2022-01-02:** A more compact proposal for +pragmas is [elsewhere in this directory](proposal-S-F.md); +it presents only variants S and F of this proposal and omits +much of the design discussion and all of the samples showing +how pragmas can be used to solve specific use cases. This +document remains relevant because it covers those topics, +but for current details of the proposed syntax for pragmas +and namespace declarations, see the other document. + +This document describes a proposal for adding *pragmas* to the +Invisible-XML specification. + +It was prepared by Tom Hillman and +Michael Sperberg-McQueen. It is currently as complete as we expect to make it. + +The general idea of pragmas is to provide a channel for information +that is not a required part of the ixml specification but can be used +by some implementations to provide useful behavior, without +interfering with the operation of other implementations for which the +information is irrelevant. Pragmas can also be used to provide +optional features in the ixml specification. The additional +information contained in pragmas may be used to control options in a +processor or to extend the specification (in roughly the same way as +pragmas and structured comments in C or Pascal programs may be used to +control optimization levels in some compilers). 
+ +On this view, pragmas are a form of annotation, and we use the terms +*pragma* and *annotation* accordingly. + +The proposal described here is inspired in part by the `xsl:fallback` +and `use-when` mechanisms of XSLT and the *extension expression* +and *annotation* mechanisms of XQuery. SGML and XML processing +instructions have also contributed to our thinking. + +In working out the proposal for pragmas, we have come to believe that +in order for pragmas to work as designed, some form of namespace +binding must be available in ixml. This could be done by inventing +new syntax for namespace bindings, but what we propose here is to use +the syntax of pragmas to declare namespace bindings: the net effect is +that the spec would (a) define a syntax for pragmas and (b) define one +particular set of pragmas that all ixml processors must support. + +This document thus includes both a proposal for pragmas and a proposal +for namespace binding, each of which assumes the other. Each proposal +has two variants (F and V for the pragmas proposal, U and S for the +namespaces proposal). 
+ +Contents: +* [Use cases](#use-cases) +* [Requirements and desiderata](#requirements-and-desiderata) +* [Design questions](#design-questions) +* [Pragma proposal(s)](#pragma-proposals) + * [The brackets-QName proposal](#the-brackets-qname-proposal) + * [The ixml form](#the-ixml-form) + * [Marks on pragmas in V](#marks-on-pragmas-in-v) + * [Marks on pragmas in F](#marks-on-pragmas-in-f) + * [The XML form of pragmas in F](#the-xml-form-of-pragmas-in-f) + * [The XML form of pragmas in V](#the-xml-form-of-pragmas-in-v) + * [Pragmas and other extension mechanisms](#pragmas-and-other-extension-mechanisms) + * [Annotating symbols, rules, or grammars](#annotating-symbols-rules-or-grammars) + * [An example](#an-example) +* [Worked examples](#worked-examples) + * [Namespace declarations](#namespace-declarations) + * [Renaming](#renaming) + * [Name indirection](#name-indirection) + * [Rule rewriting](#rule-rewriting) + * [Tokenization annotation and alternative formulations](#tokenization-annotation-and-alternative-formulations) + * [Text injection](#text-injection) + * [Attribute grammar specification](#attribute-grammar-specification) + * [Pragmas for proposal V](#pragmas-for-proposal-v) +* [Namespace binding proposals](#namespace-binding-proposals) + * [Namespace binding, common rules](#namespace-binding-common-rules) + * [Namespace binding in proposal U](#namespace-binding-in-proposal-u) + * [Namespace binding in proposal S](#namespace-binding-in-proposal-s) +* [Open issues](#open-issues) +* [Decisions to be made by the group](#decisions-to-be-made-by-the-group) +* [References](#references) + +## Use cases + +Among the use cases that motivate the proposal are these. + +Note that some of these use cases may in practice be handled by +changes to the core syntax of ixml. 
We include them in the list of use +cases for pragmas not because we think they are best handled by +pragmas but because they are (a) plausible ideas for things one might +want to do which are (b) not supported by ixml in its current form, +and thus (c) natural examples of the kinds of things an extension +mechanism like pragmas ought ideally to be able to support. + +* Namespace declarations. + + Using pragmas to specify that all or some elements of the XML +returned by an ixml processor should go into a specified namespace. + +* Renaming + + Using pragmas to specify that an element or attribute name +serializing a nonterminal should be given a name different from the +nonterminal itself. (As in Steven Pemberton's proposal for element +renaming.) + +* Name indirection + + Using pragmas to specify that an element or attribute name should be +taken not from the grammar but from the string value of a given +nonterminal. + +* Rule rewriting + + Using pragmas to specify that a rule as given is shorthand for a set +of other rules, which can be obtained by rewriting the rule as given. + +* Tokenization annotation + + Using pragmas to annotate nonterminals in an ixml grammar to +indicate that they (a) define a regular language and (b) can be safely +recognized by a greedy regular-expression match. + +* Alternative formulations + + Using pragmas to provide alternative formulations of rules in an +ixml grammar to allow different annotation or better optimization. + +* Text injection + + Using pragmas to indicate that a particular string should be +injected into the XML representation of the input as (part of) a text +node, or as an attribute or element. (This can help make the output of +an ixml parser conform to a pre-existing schema.) 
+ +* Attribute grammar specification + + Using pragmas to annotate a grammar with information about +grammatical attributes to be associated with nodes of the parse tree, +whether they are inherited from an ancestor or an elder sibling or +synthesized from the children of a node, and what values should be +assigned to them. Grammatical attributes are not to be confused with +XML attributes, although in particular cases it may be helpful to +render a grammatical attribute as an XML attribute. + +*Are there other use cases that need to be mentioned here?* + +Some of these use cases seem most naturally handled by annotations +which apply to a grammar as a whole, some by annotations which apply +to individual rules, and some by annotations which apply to individual +symbols in the grammar. + +We do not see a strong use case for annotations which apply to +arbitrary expressions in a grammar. + +## Requirements and desiderata + +Our tentative list of requirements and desiderata is as follows. + +By *requirement* we mean a property or functionality which must be +achieved for a pragmas proposal to be worth adopting. +By *desideratum* we mean a property or functionality that should be +included if possible, but which does not doom the proposal to +pointlessness if it proves impossible to achieve. + +Requirements: + +* It must be straightforward for processors to ignore pragmas they do +not understand, and to determine whether they 'understand' a given +pragma or not. + +* It must be clear to human readers and software which expressions in +standard ixml notation are and are not affected or overridden by a +given pragma. + +* Any pragma must thus specify (explicitly or implicitly) both what +should be done by a processor that understands and processes the +pragma and what should be done by a processor that does not understand +and process the pragma. We refer to the latter as the *fallback +expression*. 
+ + +Desiderata: + +* Ideally, the result of evaluating the fallback expression should be + a useful and meaningful result, but this is more a matter for the + individual writing a grammar than for this proposal. The + desideratum for a pragmas proposal is to make it easy (or at least + not unnecessarily hard) to write useful fallbacks. + +* It should ideally be possible to specify pragmas as annotations + applying to a symbol, a rule, or a grammar as a whole, and it should + be possible to know which is which. It is not required that the + distinction be a syntactic one, however, since it can also be + expressed by the semantics of the particular pragma. + +* It should ideally be possible for processors to generate the XML + representation of an ixml grammar containing pragmas, even if they + do not understand the pragmas contained. And conversely it should + ideally be possible for processors to write out the ixml form of an + XML grammar containing pragmas, even if the processor does not + understand the pragmas appearing in the grammar. + +## Design questions + +Several design questions can be distinguished; they are not completely +orthogonal. + +* What information should be encodable with pragmas? + +* What syntax should pragmas have in invisible XML? + +* What representation should pragmas have in the XML form of a +grammar? + +* Where can pragmas appear? + + +## Pragma Proposal(s) + +The current proposal for pragmas is given the arbitrary name of +'brackets QName' for discussion; an earlier proposal (the 'hash-QName' +proposal) has been withdrawn, though traces of it may remain in other +documents in this branch. + +### The brackets-QName proposal + +In working out the details of the brackets-QName proposal it has +become clear that as initially conceived it requires that ixml be +extended in various ways with mechanisms for: + +* Binding prefixes to namespaces so that QNames can be interpreted as +usual in XML and related specifications. 
(See the *Namespace +declarations* use case below.) + +* Serializing a nonterminal as an element or attribute whose name +is taken not from the grammar (as in ixml as currently specified) +but from the input data. (See the *Name indirection* use case below.) + +* Deciding whether to serialize a given nonterminal as an element or +as an attribute based on what is found in the data. (This may require +nothing more elaborate than what is described in the *Renaming* use +case below.) + +Some of these extensions can themselves be introduced using pragmas, +as illustrated in the *Worked examples* section below, but it is clear +that adding so much new functionality to ixml for the sake of pragmas +may feel like a heavy lift to some members of the community group. So +the discussion below describes two variants of the bracket-QName +proposal: a 'fixed-form' variant and a 'variable-form' variant, so +named for the relative fixity or variation in the XML representation +of pragmas in the two forms of the proposal. For brevity they are +often referred to as F and V respectively. Since they have a great +deal in common, they are described in parallel rather than separately. + +In both forms of this proposal, pragmas in ixml take the form of a +left square bracket, an optional mark, a QName or a 'URI-qualified +name', the pragma's data, and a right square bracket. Nested pairs of +square brackets are allowed, so pragmas can nest arbitrarily deep. + + +#### The ixml form + +In the ixml form of a grammar, pragmas can occur within whitespace in +several locations: + +* before the first rule of the grammar; these pragmas apply to the + grammar as a whole. + +* before a terminal or nonterminal symbol on the right-hand side of a + rule, before or after the mark if any; these pragmas apply to that + occurrence of the symbol. + +* on the left-hand side of a rule before the rule name, before or + after the mark if any; these pragmas apply to the rule. 
+ +* immediately before the full stop of a rule; these pragmas apply to + the rule. + +Two locations are allowed for pragmas applying to rules, in order to +allow them to appear either first or last. This is essentially a +rhetorical choice, but an important one as it can make a difference to +readability. + +The relevant changes to the ixml grammar are these. First, in several +rules the options `(mark, S)?` and `(tmark, S)?` are replaced by new +nonterminals which provide for both marks and pragmas. In some +locations the nonterminal *S* is replaced by *SP* (space-or-pragma), +to allow pragmas to appear as described above. + + +```` + rule: annotation, name, S, ["=:"], S, -alts, (pragma, SP)?, ".". + nonterminal: annotation, name, S. + -quoted: tannotation, -string. + -encoded: tannotation, -"#", @hex, S. + inclusion: tannotation, set. + exclusion: tannotation, "~", S, set. + + -SP: (S; pragma)* + -annotation: (pragma, SP)?, (mark, SP)?. + -tannotation: (pragma, SP)?, (tmark, SP)?.. +```` + +To allow pragmas pertaining to the grammar as a whole to precede the +first rule of the grammar, the production rule for *ixml* is changed. +To ensure that pragmas in the prolog can be distinguished +syntactically from pragmas attached to the left-hand side of the first +rule, pragmas in the prolog are required to be followed by full stops. + + +```` +-SP: (S; pragma)*. +pragma: -"[", @pmark?, @pname, (S, pragma-data)?, -"]". +@pname: -QName; -UQName. +-QName: -name, ':', -name. +-UQName: 'Q{', -ns-name, '}', -name. +-ns-name: ~["{}"; '"'; "'"]* { oversimplification }. +@pmark: ["@^?"]. +pragma-data: (pragma-chars; pragma)*. +-pragma-chars: ~["[]"]*. +```` + +Note that these ixml fragments use only the marks and serialization +rules currently supported by ixml. If the variable-form proposal is +adopted, it will probably make sense to add new marks or new +serialization rules, or both. 
+ +A simple example illustrates the core syntactic ideas: +```` + [my:pitch C#] + ^ [my:color blue] a = b, [@my:flavor vanilla] c? [my:spin ...]. +```` + +This fragment assumes that the prefix *my* is bound to some namespace, +by means not shown here (*and to be determined*). + + +#### Marks on pragmas in V + +In the variable-form proposal, the *pmark* signals which of various +XML constructs represents the pragma in the XML form of the grammar: + +* An ixml pragma marked `^` corresponds to an extension element. + +* An ixml pragma marked `@` corresponds to an extension attribute. + +* An ixml pragma marked `?` corresponds to a processing instruction. + +To ensure that the ixml grammar has an XML representation, any two +pragmas marked `@` and attached to the same construct must have +different expanded names. + +Since ixml pragmas marked `@` all correspond to attributes, the +precise location at which the pragmas appear in the ixml form of a +grammar cannot (*or:* must not; *or:* by definition does not) convey +information relevant to the meaning or processing of the pragma. A +pragma marked `@`, for example, has the same meaning whether it +appears before the first rule in the grammar or after the last rule, +or at some point between rules in the grammar. Similarly the +positions of pragmas marked `@` relative to other pragmas attached to +the same construct carry no meaningful information. + +#### Marks on pragmas in F + +In the fixed-form proposal, the *pmark* is included but has no special +meaning for a standard processor, since all pragmas have the same +representation in XML grammars. + + +#### The XML form of pragmas in F #### + +In F, all pragmas in the XML form of a grammar take the form +implicitly described by the grammar fragments shown earlier: a +`pragma` element with attributes named *pname* and *pmark* and a child +element named `pragma-data`. 
+ +It is not signaled in the ixml grammar (because ixml has no way to say +it), but other child elements may follow the `pragma-data` element. +Their content is required to be reconstructible from the pragma data +and the fallback expression, but they may express the information in a +more convenient form. (For example, the pragma data may be a +structured expression which a conforming application will parse; the +parsed form of the pragma data may be enclosed in the pragma.) In this +way we ensure that the ixml and XML forms of a grammar contain the +same information, although the XML form of the grammar may be easier +to process by machine. + +For example, the ixml pragma `[my:pitch C#]` corresponds to the following +XML pragma + +```` + + C# + ... + +```` + +The ellipsis shows where additional elements not constrained by this +proposal may appear. The only constraint is that it must be possible +in principle to construct them from the ixml form of the grammar. + +It follows from the grammar fragments above that in an XML grammar, +pragmas may occur in different locations which annotate different +parts of the grammar: + +* as a child of the `ixml` element before, between, or + after `rule` elements. These correspond to ixml pragmas occurring + before, between, or after rule elements. + +* as a child of the `rule` element, either before all `alt` children + of the rule or after them. A pragma occurring before the `alt` + children corresponds to an ixml pragma occurring on the left-hand + side of a rule; a pragma occurring after the last `alt` child + corresponds to an ixml pragma appearing in the whitespace before the + full stop of the rule. + +* as a child of a `nonterminal`, `literal`, `inclusion`, or +`exclusion` element. These correspond to ixml pragmas occurring + immediately before the terminal or nonterminal symbol in question, + before or after the mark. 
+ + +#### The XML form of pragmas in V + +In the variable-form proposal, XML pragmas may occur in the same +locations when they take the form of elements or processing +instructions. They also occur as attributes on the parent element. + +When a grammar in XML form is written out into ixml form, extension +attributes appearing on the `ixml` element may be serialized either +before the first rule, after the last one, or between any two rules. +Attributes appearing on the `rule` element may be serialized either on +the left-hand side of the rule or before the full stop. Pragmas +attached to a symbol in the right-hand side of a rule may be +serialized either before or after the mark. In all of these cases, the +possible positions are all equivalent. + +For attributes, the attribute name is the QName of the ixml pragma and +the attribute value is the pragma data. + +For processing instructions, the PI name is the QName of the +ixml pragma and the PI value is the pragma data. + +For extension elements, the element name is the QName of the ixml +pragma and the pragma data appears as character data within a child +named `pragma-data`. As in proposal F, the element may contain other +XML elements with a structured representation of relevant information. + + +#### Pragmas and other extension mechanisms + +Some XML formats make the provision that any namespace-qualified +attributes and elements may occur in documents, provided their +namespace is not reserved for other purposes. For example, both XSLT +and XSD provide that namespace-qualified attributes in other namespaces +may appear on elements in the core namespace, and both allow what we +might call foreign elements in other locations, although not at all +locations in a document. + +In general, the purpose of such provisions is similar to that of +pragmas, so it makes sense to ask how such foreign elements and +attributes relate to the XML pragmas described here. 
In particular, +when can they be rendered as pragmas in the ixml form? + +Under proposal F, pragmas in XML are always `pragma` elements; foreign +attributes and elements are not formally pragmas and the ixml spec +would under proposal F define no correspondence between them and any +ixml notation. + +Under proposal V, the situation is more complex: + +* Non-ixml namespaced attributes on the `ixml`, `rule`, `nonterminal`, + `literal`, `inclusion`, and `exclusion` elements can be recognized + as pragmas. + +* Non-ixml namespaced elements can be recognized as pragmas if they + appear in one of the specified locations. + +* Processing instructions can be recognized as pragmas if they occur + in one of the specified locations. + +Any pragma recognized can be written out in ixml notation. In the +case of attributes and processing instructions, the pragma data will +be taken from the value of the node. In the case of elements, the +pragma data will be the string value of the `pragma-data` element +appearing as a child of the element, if there is one. + +Non-ixml constructs not recognized as pragmas cannot be translated +interoperably to ixml form. + +#### Annotating symbols, rules, or grammars + +As described in the Desiderata, pragmas can apply to symbols, rules, +to a subset of rules, or to the grammar as a whole. There are several +cases. + +* Pragmas applicable to one occurrence of one symbol appear in ixml + before that symbol, either before or after the mark if any; in XML + they appear as attributes or children of the corresponding XML + element (`nonterminal`, `literal`, etc.). + +* Pragmas applicable to one rule appear in ixml either before the + left-hand side of that rule (after the mark) or before the full stop + of the rule; in XML they appear as attributes or children of the + `rule` element. + + Note that while we speak of the pragma as applying to the rule, + it may in practice apply more narrowly to the symbol on the + left-hand side of the rule, e.g. 
to provide a default value for + some property which may be overridden on individual occurrences + of the nonterminal, just as marks in ixml do now. + +* Pragmas applicable to a set of rules appear in ixml before the first + rule of the set and in XML before the first `rule` element of the + set. N.B. there is no syntactic method for specifying the last rule + of the set. If it matters, it must be handled semantically in the + specification of the pragma. Any software that does not understand + or implement the pragma in question may and should assume that the + pragma applies to all following rules. (It is not clear exactly + what use a parser that doesn't understand the pragma would make of + that information; we specify this only out of an abundance of + caution.) + +* Pragmas applicable to the grammar as a whole appear before the first + rule. + +F and V are the same in this regard. + +#### An example + +For example: + +```` + [my:pitch C#] + ^ [my:color blue] a = b, [@my:flavor vanilla] c? [my:spin ...]. +```` + +The corresponding XML form in proposal V is: + +```` + + C# + + + blue + + + + + + ... + + +```` + +In F, the corresponding XML form is: + +```` + + C# + + + + blue + + + + + + + ... + + +```` + +Annotations appearing between rules in the ixml correspond to nodes in +the XML appearing as children or attributes of the `ixml` element. In +the example, this is the case for the `my:pitch` pragma. + +Annotations appearing in the ixml between the mark and the nonterminal +on the left-hand side of a rule correspond in the XML to attributes or +children of `rule` elements. In the example, this is the case for the +`my:color` pragma. + +Annotations appearing immediately before an occurrence of a symbol in +the right-hand side of an ixml rule pertain to the occurrence of that +symbol and correspond to attributes or children of the corresponding +element in the XML grammar. In the example, this is the case for the +`my:flavor` pragma on the nonterminal *c*. 
+ +Annotations appearing before the full stop at the end of the rule +pertain to the rule as a whole and correspond to extension elements +appearing as the last children of a `rule` element. In the example, this +is the case for the `my:spin` pragma. + + +## Worked examples + +*This section should contain, for some or all of the use cases, fully +worked examples showing simple grammars that use the annotations in +question.* + + +### Namespace declarations. + +Using pragmas to specify that all or some elements of the XML returned +by an ixml processor should go into a specified namespace. + +For example, let's assume that we want some elements to go into one +namespace, some into another, and some into none, and that namespace +bindings should remain constant throughout the grammar (so: no +changing the default namespace in the middle of the document). + +*Note that there is a bootstrapping issue here: the proposal made in +this document requires that pragmas be identified by qualified names, +which requires some level of namespace support in ixml itself. So +there is a certain unavoidable artificiality in the approach taken in +the following discussion, of trying to support pragmas for namespace +declarations without assuming namespace support in the base ixml.* + +We define the namespace "http://example.com/ixml-namespaces" (*final +decision on namespace name pending*) as providing for namespace +bindings, and we adopt the convention that a prefix `ppp` is bound to +a namespace name `nnn` if a pragma of the form + +```` +[@nsd:ppp nnn] +```` + +is encountered in a context where the prefix *nsd* is bound to +"`http://example.com/ixml-namespaces`". + +To bootstrap the process, we adopt the principle that a +pragma of the form + +```` + [@ns:ns http://example.com/ixml-namespaces] +```` + +binds the prefix *ns* to that namespace. 
+ +That is, a pragma-aware processor must be on the alert for pragmas +with the following properties: + +* In the pragma's QName, the prefix and local name are the same + NCName (non-colonized name). + +* The pragma's data is the magic namespace name + "`http://example.com/ixml-namespaces`". + +Alternatively, the prefix *ns* might be bound to the namespace +indicated using a pragma of the form + +```` + [@Q{http://example.com/ixml-namespaces}:ns http://example.com/ixml-namespaces] +```` + +When either form of the pragma is found, it is interpreted as binding +prefix *ns* (whatever prefix that might be) to the indicated +namespace. Any subsequent pragma with a QName using that prefix and a +URI as pragma data is to be interpreted as a namespace declaration in +the obvious way. + +In the XML form for grammars, the corresponding constructs are (a) a +namespace declaration binding the prefix *ns* to the given namespace +and (b) an attribute with the qualified name *ns:ns* with the value +"`http://example.com/ixml-namespaces`". + +*(Alternatively, we could follow the example of XML namespaces and +reserve some string for namespace bindings, but this example is trying +to show a way to do this that does not trample over what should be the +user's choice of names. Note that either way, at least one thing must +be specified in the spec for the bootstrapping process to work: +either a magic prefix, analogous to the* `xmlns` *prefix of the XML +Names recommendation, or a magic URI as shown here.)* + +So a grammar in which some elements are in the *x* namespace, some in +the *y* namespace, and some in no namespace at all, might be + +```` + [@nsd:nsd http://example.com/ixml-namespaces] + [@nsd:x http://example.org/NS/existential] + [@nsd:y http://example.com/NS/yoyo] + x:sentence: x:a, ' ', y:b?, '. ', c. + x:a: 'Speed'. + y:b: 'kills'. + c: 'It really does.'. 
+```` +The XML representation of the grammar might be (in form V): + +```` + + + + + + + + + + + + + + + + + + + + + + + + + + +```` + +In form F, the beginning of the grammar is different: + +```` + + + http://example.com/ixml-namespaces + + + http://example.com/NS/existential + + + http://example.com/NS/yoyo + + ... +```` + +An ixml parser supporting these namespace pragmas will emit +appropriate namespace bindings on the `x:sentence` element and the +prefixed names in the grammar will serialize in instances as prefixed +names in the XML, with appropriate namespace bindings. + +The fallback behavior of a parser that does not support these pragmas +will be as under the current spec, which someone wearing a +language-lawyer hat tells us is probably to emit output that lacks +necessary namespace declarations and is technically speaking +well-formed XML but not *namespace-well-formed* XML. + +This example does not define a capability for changing namespace +bindings within a document. It's an example. + + +### Renaming + +Using pragmas to specify that an element or attribute name +serializing a nonterminal should be given a name different from the +nonterminal itself. (As in Steven Pemberton's proposal for element +renaming.) + +In the grammar below, the two forms of month have different +syntaxes, so they are required to have different nonterminal names, +and so they are required to be serialized using different XML element +names. + +We define a renaming pragma which specifies the name to be used +when serializing a nonterminal as XML. A parser which does not +support the pragma will produce results in which some months +are named `month` and others `nmonth`; a parser which does +support the pragma will call them all `month`. + +```` + [@nsd:nsd http://example.com/ixml-namespaces] + [@nsd:sp + https://lists.w3.org/Archives/Public/public-ixml/2021Oct/0014.html] + + date: day, " ", month, " ", year. + day: d, d?. + month: "January"; "February"; etc. 
+ year: d, d, d, d. + + iso: year, "-", [sp:rename month] nmonth, "-", day. + nmonth: d, d. +```` + +The namespace bindings in the example assume namespace pragmas as +described above. Since we require pragmas to be associated with +extended names, some mechanism for binding shortnames to namespaces is +required for convenience. If we wish, however, we can formulate this +example with a literal URI-qualified name. In that case, the *iso* +rule would read as follows. + +```` +iso: + year, "-", + [Q{https://lists.w3.org/Archives/Public/public-ixml/2021Oct/0014.html}:rename month] nmonth, + "-", day. +```` + +The fallback behavior of a parser that does not support these pragmas +will be to produce output using both the element name `month` and the +element name `nmonth`. + +### Name indirection + +Using pragmas to specify that an element or attribute name should be +taken not from the grammar but from the string value of a given +nonterminal. + +Consider the following grammar which recognizes a superset of +a simple subset of XML. It's a subset of XML for simplicity, and it's +a superset of the subset because a grammar written at this level +cannot enforce the well-formedness constraints of XML. +```` + { A grammar for a small subset of XML, as an illustration. } + + element: start-tag, content, end-tag; sole-tag. + + -start-tag: "<", @gi, (ws, attribute)*, ws?, ">". + -end-tag: "". + -sole-tag: "<", @gi, (ws, attribute)*, ws?, "/>". + + attribute: @name, ws?, "=", ws?, @value. + @value: dqstring; sqstring. + -dqstring: dq, ~['"']*, dq. + -sqstring: sq, ~["'"]*, sq. + -dq: -['"']. + -sq: -["'"]. + + -content: (PCDATA; processing-instruction; comment; element)*. + + PCDATA: (~["<>&"]; "&"; "<"; ">")*. + processing-instruction: "". + comment: "<--", commentdata, "-->". + + gi: name. + gi2: name. + { name is left as an exercise for the reader. } + + ws: (#20; #A; #C; #9)+. 
+```` + +Among the input sequences which should be accepted +by this grammar is the following XML representation of a +haiku. + +```` + + When the old pond + gets a new frog + it's a new pond. + +```` + +We might like an ixml processor to read this and produce +the same XML that any XML parser would produce. (This +desire makes sense only when the ixml processor's results +are supplied to a user in a DOM or XDM or SAX or other +XML API or model.) What the grammar above will produce +is isomorphic to this result, but not the same (*WARNING: +output produced manually, may be inaccurate*): + +```` + + + + + When the old pond + + + gets a new frog + + + It's a new pond. + + +```` + +We can use the following pragmas to obtain normal XML from +parsing with the grammar: + +* `xp:name` *expression* - specifies that the name under which a +nonterminal is to be serialized is given by the value of the supplied +XPath expression, interpreted with the standard ixml result element as +the context node and with the result coerced to type *xs:string*. + +* `xp:serialize` *keyword* - specifies that the nonterminal is to be +serialized as specified by the keyword (which is assumed to be +`attribute`, `element`, or the name of some other XPath node test). + +* `xp:drop` - specifies that the nonterminal so annotated is to be +suppressed entirely, along with the entire parse tree dominated by the +nonterminal. + +With these pragmas, we can annotate the *element* and *attribute* +rules appropriately: +```` +^ [xp:name @gi] element: start-tag, content, end-tag; sole-tag. +... +-end-tag: "". +... +^ [xp:serialize attribute] + [xp:name @name] + attribute: @name, ws?, "=", ws?, @value. +```` + +### Rule rewriting + +Using pragmas to specify that a rule as given is shorthand for a set +of other rules. Consider the following simple grammar for +arithmetic expressions. + +```` +expr: term; expr, addop, term. +term: factor; term, mulop, factor. +factor: number; var; -'(', -expr, -')'. +... 
+```` + +We might find it inconvenient that the number 42 is represented as +```` + + + + 42 + + + +```` + +One simple rule to simplify the XML representation of sentences in +this language is to specify that if an element *E* has only one child, +*E* should not be tagged and only the child should appear in the XML. + +We can do this in ixml by expanding the grammar, splitting each +nonterminal into two rules, one producing a visible serialization and +one hiding the nonterminal on serialization. + +```` +-EXPR: TERM; expr. +expr: EXPR, addop, TERM. +-TERM: FACTOR; term. +term: TERM, mulop, FACTOR. +-FACTOR: number; var; -'(', EXPR, -')'. +... +```` + +Now 42 parses more simply as `42`. + +The rewrite is mechanical enough that we can automate it, and +error-prone enough that it may be worth automating. If a rule has +some right-hand sides guaranteed to produce at most one child each and +some guaranteed to produce at least two children each, it's split into +two rules. The first gets a new nonterminal and has the original +single-child right-hand sides as alternatives, as well as a reference +to the original nonterminal. It's marked hidden. The second rule +gets the original nonterminal. All references to the original +nonterminal are changed to be references to the new nonterminal. + +If we call the relevant pragma *rewrite:no-unit-rules*, or more +briefly *r:nur*, the grammar takes the following form. Note that in +order to ensure that the first pragma is correctly interpreted as +belonging to the first rule and not to the grammar as a whole, we must +specify an explicit mark for the first rule. We specify one for the +second rule as well just for visual parallelism. In practice, we +also need a rule that means "don't rewrite the entire rule, but +replace references to rules rewritten using *r:nur*; we call this +second pragma *r:ref*. + +```` +^ [r:nur] expr: term; expr, addop, term. +^ [r:nur] term: factor; term, mulop, factor. 
+- [r:ref] factor: number; var; -'(', -expr, -')'. +... +```` + +The XML representation of this grammar can plausibly exploit the +ability of extension elements to contain an XML representation of the +new rules. Both the `r:nur` and the `r:ref` elements within a rule +instruct the implementation to replace the enclosing rule with the +rules appearing as children of the extension elements. + +```` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ... + +```` + +The attentive reader will note that the XML form shown is that for the +V proposal; the form it would take in the F proposal should be easily +constructed. + +The fallback behavior of a processor that doesn't support these +pragmas will be to serialize `expr` and `term` elements even when they +have only one child. + + +### Tokenization annotation and alternative formulations + +We can use pragmas to annotate nonterminals in an ixml grammar to +indicate that they define a regular language and can be safely +recognized by a greedy regular-expression match. + +For example, consider the grammar for a simple programming language. +A processor might read programs a little faster if it could read +identifiers in a single operation; this will be true if when an +identifier is encountered, the identifier will always consist of the +longest available sequence of characters legal in an identifier. In +the toy Program.ixml grammar, the rule for identifiers is: + +```` + identifier: letter+, S. +```` + +We can annotate *identifier* to signal that it's safe to consume an +identifier using a single regular-expression match by using a pragma +in a 'lexical scanning' (ls) namespace: + +```` + [ls:token] identifier: letter+, S. +```` + +The rules for comments in ixml itself offer another wrinkle. + +```` + comment: -"{", (cchar; comment)*, -"}". + -cchar: ~["{}"]. 
+```` + +Within a comment, any sequence of characters matching *cchar* can be +recognized in a single operation; there is no need to look for +alternate parses that consume only some of the characters. But there +is no nonterminal here that matches all and only non-empty sequences +of *cchar*. In order to use the *ls:token* annotation here, we must +first rewrite the grammar at this point. So we introduce an +annotation named *ls:rewrite* to be attached to a single grammar rule +with the meaning that the pragma data provide an alternate form of the +rule. + +We can now annotate the grammar and supply an alternative formulation of +*comment* that replaces it with two new rules: + +```` + ^ [ls:rewrite + comment: -"{", (cchars; comment)*, -"}". + [ls:token] -cchars: cchar+. + ] + comment: -"{", (cchar; comment)*, -"}". + -cchar: ~["{}"]. +```` + +Or we may find it easier to read if we inject the alternative +formulation after, not before, the existing rule: + +```` + comment: -"{", (cchar; comment)*, -"}" + [ls:rewrite + comment: -"{", (cchars; comment)*, -"}". + - [ls:token] cchars: cchar+. + ]. + -cchar: ~["{}"]. +```` + +Either way, the rewrite contains an alternative formulation of the +grammar which recognizes the same sentences and provides the same XML +representation but may be processed faster by some processors. + +The fallback behavior of a processor that doesn't support these +pragmas will be to parse as usual using the grammar as specified. + +Note however that there is no guarantee or requirement that the +alternate rules in an *ls:rewrite* pragma be equivalent to the +fallback rules: pragmas may change the behavior of a processor, and +they may change the meaning of an expression (or here the meaning of a +grammar or part of it). + + +### Text injection + +Using pragmas to indicate that a particular string should be +injected into the XML representation of the input as (part of) a text +node, or as an attribute or element. 
(This can help make the output of +an ixml parse conform to a pre-existing schema.) + +### Attribute grammar specification + +*Example: synthesized value attribute for arithmetic expressions.* + +Consider the following simple grammar for arithmetic expressions +involving addition and multiplication over single-digit integers. + +```` + expr: expr, s, '+', s, term. + term: term, s, '*', s, factor. + factor: '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9'; + -'(', s, expr, s, -')'. + s: [#20; #A; #D; #9]*. +```` + +In an attribute-grammar system, we might define the *value* of an +expression as a synthesized (bottom-up) grammatical attribute +following the rules: + +* The value of a *factor* consisting of a single digit is the value of +the integer usually so written: '0' has the value of zero, '1' has the +value of one, etc. + +* The value of a *factor* consisting of a parenthesized *expr* is the +value of the *expr*. + +* The value of a *term* consisting solely of a *factor* is the value +of the *factor*. + +* The value of a *term* consisting of a *term* followed by an +asterisk and a *factor* is the product of the values of the *term* and +the *factor*. + +* The value of an *expr* consisting solely of a *term* is the value +of the *term*. + +* The value of an *expr* consisting of an *expr* followed by a +plus sign and a *term* is the sum of the values of the *expr* and +the *term*. + +Extending this to handle subtraction, division, and multiple-digit +numbers would be straightforward but require a lot more rules which +would not involve any interesting new principles. + +A conventional system for reading attribute grammars and making +parsers which parse input and calculate the values of grammatical +attributes might represent this grammar as follows. We name the +grammatical attribute *v*. (This example follows roughly the syntax of +Alblas 1991, and like Alblas assumes whitespace is someone else's +problem.) + +```` + expr_0 → expr_1 '+' term. 
+ [ expr_0.v = expr_1.v + term.v] + + term_0 → term_1 '*' factor. + [ term_0.v = term_1.v * factor.v] + + factor → '0'. + [ factor.v = 0 ] + factor → '1'. + [ factor.v = 1 ] + factor → '2'. + [ factor.v = 2 ] + factor → '3'. + [ factor.v = 3 ] + factor → '4'. + [ factor.v = 4 ] + factor → '5'. + [ factor.v = 5 ] + factor → '6'. + [ factor.v = 6 ] + factor → '7'. + [ factor.v = 7 ] + factor → '8'. + [ factor.v = 8 ] + factor → '9'. + [ factor.v = 9 ] + factor → '(' expr ')'. + [ factor.v = expr.v ] +```` + +Note that some nonterminals are subscripted so that references to +their grammatical attributes can be unambiguous. To express this +grammar in ixml, we need either (1) to allow multiple rules for the +same nonterminal, or (2) to allow pragmas before connectors like comma +or semicolon, or (3) we need to allow string-to-typed-value functions +in the style of XPath. I'll assume the latter two, along with a +string() function that returns the string value of a nonterminal. With +these assumptions, and the assumption that by means not specified the +prefix *ag* has been bound to an appropriate namespace for +attribute-grammar functionality, the attribute grammar could be +written thus using the brackets-QName syntax: + +```` + [@ag:id e0] expr: [@ag:id e1] expr, s, '+', s, term + [ag:rule e0.v := e1.v + term.v ]. + [@ag:id t0] term: [@ag:id t1] term, s, '*', s, factor + [ag:rule t0.v := t1.v * factor.v]. + factor: digit [@ag:rule factor.v := number(string(digit))]; + '(', s, expr, s, ')' [@ag:rule factor.v := expr.v ]. + digit: '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9'. + s: [#20; #A; #D; #9]. +```` + +Here *ag:id* is assumed to associate a unique identifier with a +particular instance of a nonterminal, and *ag:rule* is assumed to +contain a set of assignment statements specifying the values of +particular attributes, in a subset of XPath syntax. 
(A more serious +proposal would need some way to distinguish *e0.v* meaning "the *v* +attribute of *e0*" from *e0.v* occurring as a name which happens to +contain a dot. This is not that serious proposal.) + + +*Example: synthesized value attribute and inherited environment + attribute with variable bindings, for arithmetic expressions with + 'let'.* + +[Left as an exercise for the reader.] + +### Pragmas for proposal V + +In the ideal case, it should be possible to use pragmas defined +as in proposal F to describe the behavior of processors under +proposal V. + +*Example to be worked out with list of the required pragmas +and annotated version of the ixml grammar fragments given +earlier.* + +The differences between proposals F and V all lie in the +XML representation of pragmas: + +* In F, the element and attribute names of the pragma element + are fixed; in V the element name is assigned dynamically. + +* In F, all pragmas are elements; in V, some are attributes and some + are processing instructions. + +* In F, the *pname* and *pmark* appear as attributes on the + pragma; in V they affect the name and form of the XML pragma + but do not appear as content, attributes, or children. + +The section above on name indirection illustrates ways to solve the +first and third of these. We can adapt the pragmas shown there and +add a third for the remaining item, so that we have three pragmas, +which we define in terms of their operation on an XML representation +of the raw parse tree (i.e. the one an ixml processor would produce if +it ignored all marks); a processor might implement the pragma +differently, as long as the result were the same. + +The first two of these pragmas are defined to apply to a rule and they +describe the relation of the serialized XML for instances of that rule +to the raw parse tree for that instance. 
+ +* `pr:as` *pmark*?: If the *pmark* argument is `^` or absent, the + nonterminal is serialized as an element; if it's `@`, the + nonterminal is serialized as an attribute; if it's `?`, the + nonterminal is serialized as a processing instruction. + +* `pr:name` *pname*: The *pname* is specified by an expression (we + will assume a small subset of XPath). If the value is a lexical + QName, that QName is the name of the corresponding XML node; if the + value is a URIQualifiedName (of the form + Q{namespace-name}local-name), then a prefix is chosen, a lexical + QName is formed from that prefix and the specified local name, and + any namespace declarations necessary to bind the chosen prefix to + the specified URI are created and added to an appropriate element. + +The third pragma applies to nonterminals in a right-hand side. + +* `pr:drop`: The parse tree rooted in the nonterminal annotated with + `pr:drop` does not appear in the serialization. Since that subtree + may be used (e.g. to supply an element name), it is important as a + practical matter that this pragma be interpreted after the others. + +With these three pragmas, the ixml rule for *pragma* can be annotated +as follows: + +```` + ^ [pr:as string(pmark)] + [pr:name string(pname)] + pragma: "[", [pr:drop] @pmark?, [pr:drop] @pname, (S, pragma-data)?, "]". +```` + +Alternatively, we could define a single pragma with a sequence of +comma-separated property/value pairs, using parenthesized +comma-separated values to specify multiple values: + +```` + ^ [pr:v name: string(pname), + as: string(pmark), + drop: (pmark, pname)] + pragma: "[", @pmark?, @pname, (S, pragma-data)?, "]". +```` + +Note: The `pr:v` (for 'pragmas proposal V') pragma seems less general +than the earlier set, but it feels slightly lighter-weight. + +Note: The `pr:as` pragma is very ad hoc; a more general approach would +say that its argument should be one of 'element', 'attribute', +'processing-instruction', 'comment', 'text'. 
But then we would need +some way of saying "if the string value of *pmark* is '^', then +'element', otherwise ...", and that seems likely to lead to a long +slide down a slippery slope to a Turing-complete programming +language. One of the nice things about defining your own pragmas is +that you can give them ad hoc semantics if you need to, without +spoiling things for other people. + + +## Namespace binding proposals + +As described in the worked example for *Namespace declarations* above, +there are at least two ways for ixml to use pragmas to provide the +namespace declaration functionality necessary to allow QNames to be +used in pragmas. We call them U (for 'user-specified namespace +binding prefix') and S (for 'specification-defined namespace binding +prefix'). + +The two proposals differ in how pragmas are recognized as +namespace-binding pragmas but are otherwise similar. + +### Namespace binding, common rules + +In both proposals, the ixml spec requires that conforming processors +understand the syntax for pragmas (this is implicit in the rule that +they follow the syntax for ixml grammars) and that conforming +processors understand and implement namespace-binding pragmas which +work as follows: + +* A namespace-binding pragma whose QName has the local name *n* binds + *n* as a namespace prefix to the namespace whose name is given by the pragma + data. + + As is the case for XML namespaces generally, the pragma data + should be a legal URI, but ixml processors are not obligated to + check the URI for syntactic correctness (although they may do + so), and normally should not attempt to dereference it. + +* All namespace-binding pragmas pertain to the grammar as a whole and + must be given before the first rule of the grammar. + +* A nonterminal taking the lexical form of a QName must if serialized + be serialized as an XML element name with the same local name and + with a prefix bound to the same namespace. 
Normally the prefix + should be as given in the grammar. *(If all namespaces are declared + before the first rule, there should be no reason it should be + impossible to use the same prefix. Perhaps we can make this a + 'must'.)* + + The ixml processor is responsible for including appropriate + namespace declarations in the XML output. + +* In the XML form of an ixml grammar, all namespaces bound in the + ixml grammar should be bound in the XML form of the grammar. + + This should normally take the form of namespace declarations on the + `ixml` element. The pragmas should also be represented in the usual + way, if that differs from being realized as a namespace-binding + attribute. + +The proposals differ in their rules for how a pragma is recognized as +a namespace-binding pragma. + + +### Namespace binding in proposal U + +In this proposal, namespace-binding pragmas are those whose QName is +in a particular well-known namespace; a bootstrapping rule is used to +recognize the binding of a user-specified prefix to that namespace. + +* A pragma whose QName has the same name as its prefix and its local + name and whose value is the namespace-binding namespace + "`http://example.com/ixml-namespaces`" binds the given prefix to + that namespace. This pragma must be marked `@`. For example: + + ```` + [@nsd:nsd http://example.com/ixml-namespaces] + ```` + +* The equivalent pragma using a URI-qualified name similarly binds the + given prefix to that namespace. + + For example: + + ```` + [@Q{http://example.com/ixml-namespaces}:nsd + http://example.com/ixml-namespaces] + ```` + +In the examples just given, the prefix *nsd* is only an example: the +user can specify any desired prefix to use for namespace bindings. + +* Any pragma whose QName has the namespace + "`http://example.com/ixml-namespaces`" is recognized as a + namespace-binding pragma. 
+ +* Any pragma with an unprefixed name whose local name is bound as a + prefix to the namespace "`http://example.com/ixml-namespaces`" is + recognized as a namespace-binding pragma defining a default + namespace. + + +### Namespace binding in proposal S + +In this proposal, namespace-binding pragmas are recognized by their +use of a reserved prefix in their QName. + +* The prefix *ixmlns* is understood by all conforming ixml software as + bound to the namespace-binding namespace + "`http://example.com/ixml-namespaces`". + + *Open question: use the name 'ixmlns' or some other name? Perhaps + 'xmlns'?* + +* Any pragma whose QName has the prefix *ixmlns* is recognized as a + namespace-binding pragma. + +* Any pragma with the unprefixed name *ixmlns* is recognized as a + namespace-binding pragma defining a default namespace. + + + +## Open issues + +* The fact that extension elements can contain things that are + implicit but not explicit in the ixml form means that a schema for + the visible-XML form of a grammar, as described here, requires + manual intervention and not just a mechanical derivation from the + ixml grammar for ixml. That will make some people nervous, as it + makes us. + +* Allow pragmas between `alt` elements / immediately before the + separator between top-level alternatives in a right-hand side? + + Con: things are complicated enough as it is. Pro: it would allow + pragmas to support the attribute-grammar use case. It would make + occurrence before, after, or between `alt` elements within a `rule` + parallel to occurrence before, after, or between `rule` elements + within the `ixml` element. + + *No. Make the attribute-grammar example work some other way.* + +* How should the prolog be defined? Several formulations have + occurred to me, some equivalent and some not. Which is clearest and + nicest? 
+ + * Inline the full stop: + + `-prolog: S, pragma*(S, (-'.', S)), S.` + + * Require only one full stop, at the end of the prolog: + + `-prolog: S, pragma*S, S, -'.', S.` + + * Use a different name for pragmas with full stops, to simplify the + rule for *prolog:* + + ```` + -prolog: S, Pragma*S, S. + -Pragma: pragma, S, '.'. + ```` + + * Use the nonterminal *ppragma* rather than *Pragma:* + + ```` + -prolog: S, ppragma*S, S. + -ppragma: pragma, S, '.'. + ```` + +## Decisions to be made by the group + +* If the proposal is adopted, which form of the pragmas proposal + should be chosen? V or F? Or some other variant? + +* If the proposal is adopted, which form of the namespace binding + proposal should be chosen? U or S? Or some other variant? + +* If proposal S is adopted, what name should be used for the magic + namespace-binding prefix in proposal S? The proposal above uses + *ixmlns*; it might feel more convenient if we use `xmlns`. If we do + so, are we violating the rule that says names beginning with 'xml' + are reserved for W3C Recommendations? + +* What name should be used for the magic namespace-binding namespace? + In the examples, we use "`http://example.com/ixml-namespaces`". + + Should we use "`http://www.w3.org/2000/xmlns/`", to which the prefix + `xmlns` is bound by convention? + +* The group may also wish to weigh in on any of the open issues listed + above, if any are left when this document goes to the group. + +* *I could have sworn there were more things to put here.* + +* Once the group has resolved the questions just listed, the remaining + question is: should the proposal as thus refined be adopted for + ixml 1.0 or not? + + +## References + +Alblas 1991. Henk Alblas, "Introduction to attribute grammars," in +*Attribute grammars, applications and systems:* *International summer +school SAGA, Prague, Czechoslovakia, June 4-13, 1991, Proceedings*, +ed. H. Alblas and B. Melichar (Berlin et al: Springer, 1991) = LNCS +545, pp. 1-15. 
diff --git a/misc/temp.xy.ixml b/misc/temp.xy.ixml new file mode 100644 index 00000000..c7613e5e --- /dev/null +++ b/misc/temp.xy.ixml @@ -0,0 +1,4 @@ +x--sentence: x--a, ' ', y--b?, '. ', c. +x--a: 'Speed'. +y--b: 'kills'. +c: 'It really does.'. diff --git a/samples/ABNF/ABNF.ixml b/samples/ABNF/ABNF.ixml new file mode 100644 index 00000000..d8830819 --- /dev/null +++ b/samples/ABNF/ABNF.ixml @@ -0,0 +1,127 @@ +{ The grammar notation defined by RFC 5234, "Augmented BNF for Syntax +Specifications: ABNF", ed. D. Crocker and P. Overell, January 2008. + +The nonterminals used here are those of RFC 5234, but some definitions +have been reformulated to use ixml idioms. The definition of ABNF has +no analogue to ixml marks for guiding XML serialization; the marks +used here have been supplied by the transcriber. + +Transcribed into ixml by C. M. Sperberg-McQueen, February 2022. } + +rulelist = (rule | (c-wsp*, c-nl))+. + +rule = rulename, defined-as, elements, c-nl. + { continues if next line starts + with white space } + +rulename = ALPHA, (ALPHA | DIGIT | "-")*. + +defined-as = c-wsp*, ("=" | "=/"), c-wsp*. + { basic rules definition and + incremental alternatives } + +elements = alternation, c-wsp*. + +c-wsp = WSP | (c-nl, WSP). + +c-nl = comment | CRLF. { comment or newline } + +comment = ";", (WSP | VCHAR)*, CRLF. + +alternation = concatenation * (c-wsp*, "/", c-wsp*). { RFC 5234 allows any amount of c-wsp on both sides of "/" } + +concatenation = repetition * (c-wsp+). + +repetition = repeat?, element. + +repeat = DIGIT+ | (DIGIT*, "*", DIGIT*). + +element = rulename | group | option + | char-val | num-val | prose-val. + +group = "(", c-wsp*, alternation, c-wsp*, ")". + +option = "[", c-wsp*, alternation, c-wsp*, "]". + +char-val = DQUOTE, [#20 - #21; #23 - #7E]*, DQUOTE. + { quoted string of SP and VCHAR + without DQUOTE } + +num-val = "%", (bin-val | dec-val | hex-val). + +bin-val = "b", BIT+, ((".", BIT+)+ | ("-", BIT+))?. 
+ { series of concatenated bit values + or single ONEOF range } + +dec-val = "d", DIGIT+, ((".", DIGIT+)+ | ("-", DIGIT+))?. + +hex-val = "x", HEXDIG+, ((".", HEXDIG+)+ | ("-", HEXDIG+))?. + +prose-val = "<", [#20 - #3D; #3F - #7E]*, ">". + { bracketed string of SP and VCHAR + without angles + prose description, to be used as + last resort } + +{ 'Core rules' from Appendix B, intended for re-use. } + +ALPHA = ["A"-"Z"; "a"-"z"]. { #41-#5A; #61-#7A } + +BIT = "0"; "1". + +CR = #0D. { carriage return } + +CRLF = CR, LF. { Internet standard newline } + +DIGIT = ["0"-"9"]. { #30 - #39 } + +DQUOTE = #22. { (Double Quote) } + +HEXDIG = DIGIT; ["A"-"F"]. + +HTAB = #09. { horizontal tab } + +LF = #0A. { linefeed } + +SP = #20. + +VCHAR = [#21 - #7E]. { visible (printing) characters } + +WSP = SP | HTAB. { white space } + +{ Included in Appendix B for reuse elsewhere, but not used +by ABNF itself: } + +{ + +CHAR = [#01 - #7F]. + { Any 7-bit US-ASCII character, + excluding NUL } + +CTL = [#00 - #1F; #7F]. { controls } + +LWSP = (WSP | CRLF, WSP)*. + { Use of this linear-white-space rule + permits lines containing only white + space that are no longer legal in + mail headers and have caused + interoperability problems in other + contexts. + Do not use when defining mail + headers and use with caution in + other contexts. } + +OCTET = [#00 - #FF]. { 8 bits of data } + +} + + +{ Notes: + +- As noted in the comments, this grammar has been reported ambiguous + in a couple of places. + +- ABNF nonterminals are case-insensitive unless specified using + numeric values for the characters. + +} diff --git a/samples/ISBN/ISBN.ixml b/samples/ISBN/ISBN.ixml new file mode 100644 index 00000000..6c24e948 --- /dev/null +++ b/samples/ISBN/ISBN.ixml @@ -0,0 +1,268 @@ +{ A grammar for ISBN-13, ISBN-10, and ISSN book and serial numbers. + Effectively we use the nonterminals to keep track of the calculation + of the check digit. 
Each nonterminal of the form dX_Y handles + the digit in position X, in the case that the running total of + the check digit is Y. + + This grammar does not attempt to enforce correct use of hyphens, + which depends on the boundaries between the GS1 prefix (if any), the + registration group element, the registrant element, the publication + element, and the check digit, which in turn depend on their values. + This grammar simply allows hyphens anywhere except in the middle of + the GS1 prefix. + +} + +{ The current version is a place-holder; it only handles isbn-13. } + +{ isbn-13: A thirteen-digit ISBN } + +{ In a thirteen-digit ISBN, the check digit is the sum of the first, + third, and other odd-numbered digits (up to digit 11), plus three + times the sum of the even-numbered digits, modulo 10. Because of + the way modular arithmetic works, we only need to keep track of the + ones digit, which allows us to get by with ten nonterminals for each + position in the ISBN, starting with the fourth. The first three are + the GS1 prefixes '978' and '979', which can be handled specially. } + +isbn-13 = '978', d4_8; '979', d4_9. + +{ d4: fourth digit of the ISBN. } + +{ d4_8: fourth digit, current running sum is 8. Multiply the fourth + digit by 3, add running sum, take the modulus } + +-d4_8 = '-'?, + ( '0', d5_8 + | '1', d5_1 + | '2', d5_4 + | '3', d5_7 + | '4', d5_0 + | '5', d5_3 + | '6', d5_6 + | '7', d5_9 + | '8', d5_2 + | '9', d5_5 + ). + +{ d4_9: fourth digit, current running sum is 9. } + +-d4_9 = '-'?, + ( '0', d5_9 + | '1', d5_2 + | '2', d5_5 + | '3', d5_8 + | '4', d5_1 + | '5', d5_4 + | '6', d5_7 + | '7', d5_0 + | '8', d5_3 + | '9', d5_6 + ). + + +{ d5_x: fifth digit, x is running sum, add 1 * digit } + +-d5_0 = '-'?, ('0', d6_0 | '1', d6_1 | '2', d6_2 | '3', d6_3 | '4', d6_4 + | '5', d6_5 | '6', d6_6 | '7', d6_7 | '8', d6_8 | '9', d6_9). 
+-d5_1 = '-'?, ('0', d6_1 | '1', d6_2 | '2', d6_3 | '3', d6_4 | '4', d6_5 + | '5', d6_6 | '6', d6_7 | '7', d6_8 | '8', d6_9 | '9', d6_0). +-d5_2 = '-'?, ('0', d6_2 | '1', d6_3 | '2', d6_4 | '3', d6_5 | '4', d6_6 + | '5', d6_7 | '6', d6_8 | '7', d6_9 | '8', d6_0 | '9', d6_1). +-d5_3 = '-'?, ('0', d6_3 | '1', d6_4 | '2', d6_5 | '3', d6_6 | '4', d6_7 + | '5', d6_8 | '6', d6_9 | '7', d6_0 | '8', d6_1 | '9', d6_2). +-d5_4 = '-'?, ('0', d6_4 | '1', d6_5 | '2', d6_6 | '3', d6_7 | '4', d6_8 + | '5', d6_9 | '6', d6_0 | '7', d6_1 | '8', d6_2 | '9', d6_3). +-d5_5 = '-'?, ('0', d6_5 | '1', d6_6 | '2', d6_7 | '3', d6_8 | '4', d6_9 + | '5', d6_0 | '6', d6_1 | '7', d6_2 | '8', d6_3 | '9', d6_4). +-d5_6 = '-'?, ('0', d6_6 | '1', d6_7 | '2', d6_8 | '3', d6_9 | '4', d6_0 + | '5', d6_1 | '6', d6_2 | '7', d6_3 | '8', d6_4 | '9', d6_5). +-d5_7 = '-'?, ('0', d6_7 | '1', d6_8 | '2', d6_9 | '3', d6_0 | '4', d6_1 + | '5', d6_2 | '6', d6_3 | '7', d6_4 | '8', d6_5 | '9', d6_6). +-d5_8 = '-'?, ('0', d6_8 | '1', d6_9 | '2', d6_0 | '3', d6_1 | '4', d6_2 + | '5', d6_3 | '6', d6_4 | '7', d6_5 | '8', d6_6 | '9', d6_7). +-d5_9 = '-'?, ('0', d6_9 | '1', d6_0 | '2', d6_1 | '3', d6_2 | '4', d6_3 + | '5', d6_4 | '6', d6_5 | '7', d6_6 | '8', d6_7 | '9', d6_8). + + +{ d6_x: sixth digit, x is running sum, add 3 * digit } + +-d6_0 = '-'?, ('0', d7_0 | '1', d7_3 | '2', d7_6 | '3', d7_9 | '4', d7_2 + | '5', d7_5 | '6', d7_8 | '7', d7_1 | '8', d7_4 | '9', d7_7). +-d6_1 = '-'?, ('0', d7_1 | '1', d7_4 | '2', d7_7 | '3', d7_0 | '4', d7_3 + | '5', d7_6 | '6', d7_9 | '7', d7_2 | '8', d7_5 | '9', d7_8). +-d6_2 = '-'?, ('0', d7_2 | '1', d7_5 | '2', d7_8 | '3', d7_1 | '4', d7_4 + | '5', d7_7 | '6', d7_0 | '7', d7_3 | '8', d7_6 | '9', d7_9). +-d6_3 = '-'?, ('0', d7_3 | '1', d7_6 | '2', d7_9 | '3', d7_2 | '4', d7_5 + | '5', d7_8 | '6', d7_1 | '7', d7_4 | '8', d7_7 | '9', d7_0). +-d6_4 = '-'?, ('0', d7_4 | '1', d7_7 | '2', d7_0 | '3', d7_3 | '4', d7_6 + | '5', d7_9 | '6', d7_2 | '7', d7_5 | '8', d7_8 | '9', d7_1). 
+-d6_5 = '-'?, ('0', d7_5 | '1', d7_8 | '2', d7_1 | '3', d7_4 | '4', d7_7 + | '5', d7_0 | '6', d7_3 | '7', d7_6 | '8', d7_9 | '9', d7_2). +-d6_6 = '-'?, ('0', d7_6 | '1', d7_9 | '2', d7_2 | '3', d7_5 | '4', d7_8 + | '5', d7_1 | '6', d7_4 | '7', d7_7 | '8', d7_0 | '9', d7_3). +-d6_7 = '-'?, ('0', d7_7 | '1', d7_0 | '2', d7_3 | '3', d7_6 | '4', d7_9 + | '5', d7_2 | '6', d7_5 | '7', d7_8 | '8', d7_1 | '9', d7_4). +-d6_8 = '-'?, ('0', d7_8 | '1', d7_1 | '2', d7_4 | '3', d7_7 | '4', d7_0 + | '5', d7_3 | '6', d7_6 | '7', d7_9 | '8', d7_2 | '9', d7_5). +-d6_9 = '-'?, ('0', d7_9 | '1', d7_2 | '2', d7_5 | '3', d7_8 | '4', d7_1 + | '5', d7_4 | '6', d7_7 | '7', d7_0 | '8', d7_3 | '9', d7_6). + + +{ d7_x: seventh digit, x is running sum, add 1 * digit } + +-d7_0 = '-'?, ('0', d8_0 | '1', d8_1 | '2', d8_2 | '3', d8_3 | '4', d8_4 + | '5', d8_5 | '6', d8_6 | '7', d8_7 | '8', d8_8 | '9', d8_9). +-d7_1 = '-'?, ('0', d8_1 | '1', d8_2 | '2', d8_3 | '3', d8_4 | '4', d8_5 + | '5', d8_6 | '6', d8_7 | '7', d8_8 | '8', d8_9 | '9', d8_0). +-d7_2 = '-'?, ('0', d8_2 | '1', d8_3 | '2', d8_4 | '3', d8_5 | '4', d8_6 + | '5', d8_7 | '6', d8_8 | '7', d8_9 | '8', d8_0 | '9', d8_1). +-d7_3 = '-'?, ('0', d8_3 | '1', d8_4 | '2', d8_5 | '3', d8_6 | '4', d8_7 + | '5', d8_8 | '6', d8_9 | '7', d8_0 | '8', d8_1 | '9', d8_2). +-d7_4 = '-'?, ('0', d8_4 | '1', d8_5 | '2', d8_6 | '3', d8_7 | '4', d8_8 + | '5', d8_9 | '6', d8_0 | '7', d8_1 | '8', d8_2 | '9', d8_3). +-d7_5 = '-'?, ('0', d8_5 | '1', d8_6 | '2', d8_7 | '3', d8_8 | '4', d8_9 + | '5', d8_0 | '6', d8_1 | '7', d8_2 | '8', d8_3 | '9', d8_4). +-d7_6 = '-'?, ('0', d8_6 | '1', d8_7 | '2', d8_8 | '3', d8_9 | '4', d8_0 + | '5', d8_1 | '6', d8_2 | '7', d8_3 | '8', d8_4 | '9', d8_5). +-d7_7 = '-'?, ('0', d8_7 | '1', d8_8 | '2', d8_9 | '3', d8_0 | '4', d8_1 + | '5', d8_2 | '6', d8_3 | '7', d8_4 | '8', d8_5 | '9', d8_6). +-d7_8 = '-'?, ('0', d8_8 | '1', d8_9 | '2', d8_0 | '3', d8_1 | '4', d8_2 + | '5', d8_3 | '6', d8_4 | '7', d8_5 | '8', d8_6 | '9', d8_7). 
+-d7_9 = '-'?, ('0', d8_9 | '1', d8_0 | '2', d8_1 | '3', d8_2 | '4', d8_3 + | '5', d8_4 | '6', d8_5 | '7', d8_6 | '8', d8_7 | '9', d8_8). + + +{ d8_x: eigth digit, x is running sum, add 3 * digit } + +-d8_0 = '-'?, ('0', d9_0 | '1', d9_3 | '2', d9_6 | '3', d9_9 | '4', d9_2 + | '5', d9_5 | '6', d9_8 | '7', d9_1 | '8', d9_4 | '9', d9_7). +-d8_1 = '-'?, ('0', d9_1 | '1', d9_4 | '2', d9_7 | '3', d9_0 | '4', d9_3 + | '5', d9_6 | '6', d9_9 | '7', d9_2 | '8', d9_5 | '9', d9_8). +-d8_2 = '-'?, ('0', d9_2 | '1', d9_5 | '2', d9_8 | '3', d9_1 | '4', d9_4 + | '5', d9_7 | '6', d9_0 | '7', d9_3 | '8', d9_6 | '9', d9_9). +-d8_3 = '-'?, ('0', d9_3 | '1', d9_6 | '2', d9_9 | '3', d9_2 | '4', d9_5 + | '5', d9_8 | '6', d9_1 | '7', d9_4 | '8', d9_7 | '9', d9_0). +-d8_4 = '-'?, ('0', d9_4 | '1', d9_7 | '2', d9_0 | '3', d9_3 | '4', d9_6 + | '5', d9_9 | '6', d9_2 | '7', d9_5 | '8', d9_8 | '9', d9_1). +-d8_5 = '-'?, ('0', d9_5 | '1', d9_8 | '2', d9_1 | '3', d9_4 | '4', d9_7 + | '5', d9_0 | '6', d9_3 | '7', d9_6 | '8', d9_9 | '9', d9_2). +-d8_6 = '-'?, ('0', d9_6 | '1', d9_9 | '2', d9_2 | '3', d9_5 | '4', d9_8 + | '5', d9_1 | '6', d9_4 | '7', d9_7 | '8', d9_0 | '9', d9_3). +-d8_7 = '-'?, ('0', d9_7 | '1', d9_0 | '2', d9_3 | '3', d9_6 | '4', d9_9 + | '5', d9_2 | '6', d9_5 | '7', d9_8 | '8', d9_1 | '9', d9_4). +-d8_8 = '-'?, ('0', d9_8 | '1', d9_1 | '2', d9_4 | '3', d9_7 | '4', d9_0 + | '5', d9_3 | '6', d9_6 | '7', d9_9 | '8', d9_2 | '9', d9_5). +-d8_9 = '-'?, ('0', d9_9 | '1', d9_2 | '2', d9_5 | '3', d9_8 | '4', d9_1 + | '5', d9_4 | '6', d9_7 | '7', d9_0 | '8', d9_3 | '9', d9_6). + + +{ d9_x: ninth digit, x is running sum, add 1 * digit } + +-d9_0 = '-'?, ('0', dA_0 | '1', dA_1 | '2', dA_2 | '3', dA_3 | '4', dA_4 + | '5', dA_5 | '6', dA_6 | '7', dA_7 | '8', dA_8 | '9', dA_9). +-d9_1 = '-'?, ('0', dA_1 | '1', dA_2 | '2', dA_3 | '3', dA_4 | '4', dA_5 + | '5', dA_6 | '6', dA_7 | '7', dA_8 | '8', dA_9 | '9', dA_0). 
+-d9_2 = '-'?, ('0', dA_2 | '1', dA_3 | '2', dA_4 | '3', dA_5 | '4', dA_6 + | '5', dA_7 | '6', dA_8 | '7', dA_9 | '8', dA_0 | '9', dA_1). +-d9_3 = '-'?, ('0', dA_3 | '1', dA_4 | '2', dA_5 | '3', dA_6 | '4', dA_7 + | '5', dA_8 | '6', dA_9 | '7', dA_0 | '8', dA_1 | '9', dA_2). +-d9_4 = '-'?, ('0', dA_4 | '1', dA_5 | '2', dA_6 | '3', dA_7 | '4', dA_8 + | '5', dA_9 | '6', dA_0 | '7', dA_1 | '8', dA_2 | '9', dA_3). +-d9_5 = '-'?, ('0', dA_5 | '1', dA_6 | '2', dA_7 | '3', dA_8 | '4', dA_9 + | '5', dA_0 | '6', dA_1 | '7', dA_2 | '8', dA_3 | '9', dA_4). +-d9_6 = '-'?, ('0', dA_6 | '1', dA_7 | '2', dA_8 | '3', dA_9 | '4', dA_0 + | '5', dA_1 | '6', dA_2 | '7', dA_3 | '8', dA_4 | '9', dA_5). +-d9_7 = '-'?, ('0', dA_7 | '1', dA_8 | '2', dA_9 | '3', dA_0 | '4', dA_1 + | '5', dA_2 | '6', dA_3 | '7', dA_4 | '8', dA_5 | '9', dA_6). +-d9_8 = '-'?, ('0', dA_8 | '1', dA_9 | '2', dA_0 | '3', dA_1 | '4', dA_2 + | '5', dA_3 | '6', dA_4 | '7', dA_5 | '8', dA_6 | '9', dA_7). +-d9_9 = '-'?, ('0', dA_9 | '1', dA_0 | '2', dA_1 | '3', dA_2 | '4', dA_3 + | '5', dA_4 | '6', dA_5 | '7', dA_6 | '8', dA_7 | '9', dA_8). + + +{ dA_x: tenth digit, x is running sum, add 3 * digit } + +-dA_0 = '-'?, ('0', dB_0 | '1', dB_3 | '2', dB_6 | '3', dB_9 | '4', dB_2 + | '5', dB_5 | '6', dB_8 | '7', dB_1 | '8', dB_4 | '9', dB_7). +-dA_1 = '-'?, ('0', dB_1 | '1', dB_4 | '2', dB_7 | '3', dB_0 | '4', dB_3 + | '5', dB_6 | '6', dB_9 | '7', dB_2 | '8', dB_5 | '9', dB_8). +-dA_2 = '-'?, ('0', dB_2 | '1', dB_5 | '2', dB_8 | '3', dB_1 | '4', dB_4 + | '5', dB_7 | '6', dB_0 | '7', dB_3 | '8', dB_6 | '9', dB_9). +-dA_3 = '-'?, ('0', dB_3 | '1', dB_6 | '2', dB_9 | '3', dB_2 | '4', dB_5 + | '5', dB_8 | '6', dB_1 | '7', dB_4 | '8', dB_7 | '9', dB_0). +-dA_4 = '-'?, ('0', dB_4 | '1', dB_7 | '2', dB_0 | '3', dB_3 | '4', dB_6 + | '5', dB_9 | '6', dB_2 | '7', dB_5 | '8', dB_8 | '9', dB_1). +-dA_5 = '-'?, ('0', dB_5 | '1', dB_8 | '2', dB_1 | '3', dB_4 | '4', dB_7 + | '5', dB_0 | '6', dB_3 | '7', dB_6 | '8', dB_9 | '9', dB_2). 
+-dA_6 = '-'?, ('0', dB_6 | '1', dB_9 | '2', dB_2 | '3', dB_5 | '4', dB_8 + | '5', dB_1 | '6', dB_4 | '7', dB_7 | '8', dB_0 | '9', dB_3). +-dA_7 = '-'?, ('0', dB_7 | '1', dB_0 | '2', dB_3 | '3', dB_6 | '4', dB_9 + | '5', dB_2 | '6', dB_5 | '7', dB_8 | '8', dB_1 | '9', dB_4). +-dA_8 = '-'?, ('0', dB_8 | '1', dB_1 | '2', dB_4 | '3', dB_7 | '4', dB_0 + | '5', dB_3 | '6', dB_6 | '7', dB_9 | '8', dB_2 | '9', dB_5). +-dA_9 = '-'?, ('0', dB_9 | '1', dB_2 | '2', dB_5 | '3', dB_8 | '4', dB_1 + | '5', dB_4 | '6', dB_7 | '7', dB_0 | '8', dB_3 | '9', dB_6). + + +{ dB_x: eleventh digit, x is running sum, add 1 * digit } + +-dB_0 = '-'?, ('0', dC_0 | '1', dC_1 | '2', dC_2 | '3', dC_3 | '4', dC_4 + | '5', dC_5 | '6', dC_6 | '7', dC_7 | '8', dC_8 | '9', dC_9). +-dB_1 = '-'?, ('0', dC_1 | '1', dC_2 | '2', dC_3 | '3', dC_4 | '4', dC_5 + | '5', dC_6 | '6', dC_7 | '7', dC_8 | '8', dC_9 | '9', dC_0). +-dB_2 = '-'?, ('0', dC_2 | '1', dC_3 | '2', dC_4 | '3', dC_5 | '4', dC_6 + | '5', dC_7 | '6', dC_8 | '7', dC_9 | '8', dC_0 | '9', dC_1). +-dB_3 = '-'?, ('0', dC_3 | '1', dC_4 | '2', dC_5 | '3', dC_6 | '4', dC_7 + | '5', dC_8 | '6', dC_9 | '7', dC_0 | '8', dC_1 | '9', dC_2). +-dB_4 = '-'?, ('0', dC_4 | '1', dC_5 | '2', dC_6 | '3', dC_7 | '4', dC_8 + | '5', dC_9 | '6', dC_0 | '7', dC_1 | '8', dC_2 | '9', dC_3). +-dB_5 = '-'?, ('0', dC_5 | '1', dC_6 | '2', dC_7 | '3', dC_8 | '4', dC_9 + | '5', dC_0 | '6', dC_1 | '7', dC_2 | '8', dC_3 | '9', dC_4). +-dB_6 = '-'?, ('0', dC_6 | '1', dC_7 | '2', dC_8 | '3', dC_9 | '4', dC_0 + | '5', dC_1 | '6', dC_2 | '7', dC_3 | '8', dC_4 | '9', dC_5). +-dB_7 = '-'?, ('0', dC_7 | '1', dC_8 | '2', dC_9 | '3', dC_0 | '4', dC_1 + | '5', dC_2 | '6', dC_3 | '7', dC_4 | '8', dC_5 | '9', dC_6). +-dB_8 = '-'?, ('0', dC_8 | '1', dC_9 | '2', dC_0 | '3', dC_1 | '4', dC_2 + | '5', dC_3 | '6', dC_4 | '7', dC_5 | '8', dC_6 | '9', dC_7). +-dB_9 = '-'?, ('0', dC_9 | '1', dC_0 | '2', dC_1 | '3', dC_2 | '4', dC_3 + | '5', dC_4 | '6', dC_5 | '7', dC_6 | '8', dC_7 | '9', dC_8). 
+ + +{ dC: 12th digit. Hand edit needed ... } + +-dC_0 = '-'?, ('0', dD_0 | '1', dD_3 | '2', dD_6 | '3', dD_9 | '4', dD_2 + | '5', dD_5 | '6', dD_8 | '7', dD_1 | '8', dD_4 | '9', dD_7). +-dC_1 = '-'?, ('0', dD_1 | '1', dD_4 | '2', dD_7 | '3', dD_0 | '4', dD_3 + | '5', dD_6 | '6', dD_9 | '7', dD_2 | '8', dD_5 | '9', dD_8). +-dC_2 = '-'?, ('0', dD_2 | '1', dD_5 | '2', dD_8 | '3', dD_1 | '4', dD_4 + | '5', dD_7 | '6', dD_0 | '7', dD_3 | '8', dD_6 | '9', dD_9). +-dC_3 = '-'?, ('0', dD_3 | '1', dD_6 | '2', dD_9 | '3', dD_2 | '4', dD_5 + | '5', dD_8 | '6', dD_1 | '7', dD_4 | '8', dD_7 | '9', dD_0). +-dC_4 = '-'?, ('0', dD_4 | '1', dD_7 | '2', dD_0 | '3', dD_3 | '4', dD_6 + | '5', dD_9 | '6', dD_2 | '7', dD_5 | '8', dD_8 | '9', dD_1). +-dC_5 = '-'?, ('0', dD_5 | '1', dD_8 | '2', dD_1 | '3', dD_4 | '4', dD_7 + | '5', dD_0 | '6', dD_3 | '7', dD_6 | '8', dD_9 | '9', dD_2). +-dC_6 = '-'?, ('0', dD_6 | '1', dD_9 | '2', dD_2 | '3', dD_5 | '4', dD_8 + | '5', dD_1 | '6', dD_4 | '7', dD_7 | '8', dD_0 | '9', dD_3). +-dC_7 = '-'?, ('0', dD_7 | '1', dD_0 | '2', dD_3 | '3', dD_6 | '4', dD_9 + | '5', dD_2 | '6', dD_5 | '7', dD_8 | '8', dD_1 | '9', dD_4). +-dC_8 = '-'?, ('0', dD_8 | '1', dD_1 | '2', dD_4 | '3', dD_7 | '4', dD_0 + | '5', dD_3 | '6', dD_6 | '7', dD_9 | '8', dD_2 | '9', dD_5). +-dC_9 = '-'?, ('0', dD_9 | '1', dD_2 | '2', dD_5 | '3', dD_8 | '4', dD_1 + | '5', dD_4 | '6', dD_7 | '7', dD_0 | '8', dD_3 | '9', dD_6). + +{ dD: 13th digit. This is the check digit. } + +-dD_0 = '-'?, '0', ok. +-dD_1 = '-'?, '9', ok. +-dD_2 = '-'?, '8', ok. +-dD_3 = '-'?, '7', ok. +-dD_4 = '-'?, '6', ok. +-dD_5 = '-'?, '5', ok. +-dD_6 = '-'?, '4', ok. +-dD_7 = '-'?, '3', ok. +-dD_8 = '-'?, '2', ok. +-dD_9 = '-'?, '1', ok. + +-ok = (). 
\ No newline at end of file diff --git a/samples/ISBN/README.md b/samples/ISBN/README.md new file mode 100644 index 00000000..5d8325d9 --- /dev/null +++ b/samples/ISBN/README.md @@ -0,0 +1,99 @@ +# International Standard Book Numbers + +International Standard Book Numbers (ISBNs) are, at one level, quite +simple: in their current form they are strings of 13 digits, with +optional hyphens marking the boundaries between segments of the +number. The final digit is a check digit which helps make sure that +the rest of the number has been given correctly. + +Since the segments for the registration authority (usually a national +or regional body which manages a registry of publishers), the +registrant (usually a publisher), and publication (the book being +identified) are all variable in length, getting the hyphens in the +right positions is tricky and requires knowledge of what numbers have +been assigned in different registries. Like most tools for checking +ISBNs, this grammar makes no pretense of checking that the hyphens are +in the right places, or even that there are the expected number (none, +or four, if I understand the rules right). + +The check digit is used to enable a simple form of validation for +ISBNs. Each digit in the number is assigned a weight: one for the +first, third, and other odd-numbered digits, three for the +even-numbered digits. If each digit in the ISBN is multiplied by its +weight (1 or 3, in alternation) and the results are added together, +the sum must be evenly divisible by ten, or else the ISBN is not +valid. + +For example, the ISBN 9781441919014 can be validated as follows: +```` + 9 * 1 + + 7 * 3 + + 8 * 1 + + 1 * 3 + + 4 * 1 + + 4 * 3 + + 1 * 1 + + 9 * 3 + + 1 * 1 + + 9 * 3 + + 0 * 1 + + 1 * 3 + + 4 * 1 + _______ + + = 120 +```` +Since 120 is evenly divisible by 10, the number is valid. 
+ +Alternatively, given the first 12 digits of the ISBN, the check digit +for an ISBN-13 number can be calculated as follows: multiply the first +twelve digits of the number by their weights, sum the resulting +numbers, discard all but the last digit, subtract that last digit from +10, and take the last digit from the result (i.e. if the result is 10, +use zero). The resulting single-digit number is the correct check +digit. + +The obvious way to do this is with a simple program, so it may be +surprising that in fact the set of thirteen-digit numbers with a +correct check digit is a regular language and can be described either +with a (rather large) regular expression or with a finite state +automaton. + +The basic principle is perhaps most easily seen with a simpler +example: here is a finite state automaton which recognizes base-10 +numerals which are evenly divisible by three: + +![FSA recognizing numerals divisible by three](fsa-3.dot.png) + +The FSA recognizes the numeral "12", for example, by starting in state +0, going to state 1 on the first digit by following the arc labeled +"1", then going back to state 0 on the second digit by following the +arc labeled "2". + +An FSA to keep track of divisibility by any given modulus (e.g. 10 for +ISBN-13 numbers, 11 for ISBN-10 numbers and ISSNs) will have more +states but is otherwise similar. If we must multiply the next digit +by a weight, the labels on the arcs will be different and we will need +more states to keep track of which weight to use next, but again the +principles are similar. For ISBN-13 numbers, which use a modulus of +10 and two alternating weights, an FSA of twenty states would suffice, +except that we also need to count the digits: the first and the fifth +digit of the ISBN may use the same weight, but our grammar needs to +keep them distinct. 
+ +Any finite state automaton can be turned into a context-free grammar +in which each rule has a single nonterminal (a state name) on the left +hand side, and one or more right-hand sides each of which is either +empty or has the form *t, N*, where *t* is a terminal symbol and *N* +is the next state. Rules of the form *N1 = t N2* correspond to a +transition from state *N1* to state *N2* on symbol *t* in the finite +state automaton; an empty right-hand side signals that the state is a +final (or: accept) state. + +The grammar in ISBN.ixml can thus be read as a relatively +straightforward description of a finite state automaton to recognize +valid ISBN-13 numbers (with some complications to allow hyphens to +occur). In addition to any practical utility it may have, it +illustrates the use of ixml to describe finite state automata, and the +use of finite state automata to perform modular arithmetic +calculations. diff --git a/samples/ISBN/fsa-3.dot b/samples/ISBN/fsa-3.dot new file mode 100644 index 00000000..1b517ccb --- /dev/null +++ b/samples/ISBN/fsa-3.dot @@ -0,0 +1,16 @@ +digraph divisible_by_3 { + start [peripheries=0]; + 0 [peripheries=2]; 1; 2; + start -> 0; + 0 -> 0 [label="0, 3, 6, 9"]; + 0 -> 1 [label="1, 4, 7"]; + 0 -> 2 [label="2, 5, 8"]; + + 1 -> 1 [label="0, 3, 6, 9"]; + 1 -> 2 [label="1, 4, 7"]; + 1 -> 0 [label="2, 5, 8"]; + + 2 -> 2 [label="0, 3, 6, 9"]; + 2 -> 0 [label="1, 4, 7"]; + 2 -> 1 [label="2, 5, 8"]; +} \ No newline at end of file diff --git a/samples/ISBN/fsa-3.dot.png b/samples/ISBN/fsa-3.dot.png new file mode 100644 index 00000000..84e4aee8 Binary files /dev/null and b/samples/ISBN/fsa-3.dot.png differ diff --git a/samples/ISBN/fsa-3.dot.svg b/samples/ISBN/fsa-3.dot.svg new file mode 100644 index 00000000..ab2931c1 --- /dev/null +++ b/samples/ISBN/fsa-3.dot.svg @@ -0,0 +1,106 @@ + + + + + + +divisible_by_3 + + + +start +start + + + +0 + + +0 + + + +start->0 + + + + + +0->0 + + +0, 3, 6, 9 + + + +1 + +1 + + + +0->1 + + +1, 4, 7 + + + +2 + 
+2 + + + +0->2 + + +2, 5, 8 + + + +1->0 + + +2, 5, 8 + + + +1->1 + + +0, 3, 6, 9 + + + +1->2 + + +1, 4, 7 + + + +2->0 + + +1, 4, 7 + + + +2->1 + + +2, 5, 8 + + + +2->2 + + +0, 3, 6, 9 + + + diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 00000000..6c917601 --- /dev/null +++ b/samples/README.md @@ -0,0 +1,30 @@ +# Samples directory + +This directory contains a collection of ixml grammars for various +languages or notations for which some more or less authoritative +grammar or syntax description is available. + +These grammars are offered as illustrations of ixml which may be +useful in real applications, and due care has been exercised in +preparing them, but no guarantees of accuracy or fitness for purpose +are offered. + +Note that it is not always possible for an ixml grammar to match a +published language perfectly: many published language descriptions +assume a lexical scanner (which may for example distinguish +identifiers from reserved words), and some may require ad-hoc checks +that are not feasible in a pure context-free grammar. So do not +assume without checking that the language accepted by the ixml grammar +here matches exactly the language defined by the relevant source. + +Samples in this directory include: + +* ISBN: a grammar for International Standard Book Numbers and + International Standard Serial Numbers that checks to make sure the + check digit is correct. (Currently only ISBN-13; ISBN-10 and ISSN + coming soon.) + +* ABNF: an ixml version of the grammar notation used in IETF + Requests for Comments, as defined in RFC 5234. (Coming real soon now.) 
+ + diff --git a/tests/ambiguous/ambig.alt.output.xml b/tests/ambiguous/ambig.alt.output.xml new file mode 100644 index 00000000..3c16dfaa --- /dev/null +++ b/tests/ambiguous/ambig.alt.output.xml @@ -0,0 +1,15 @@ + + +i+i+i diff --git a/tests/ambiguous/ambig.inp b/tests/ambiguous/ambig.inp new file mode 100644 index 00000000..abb079ae --- /dev/null +++ b/tests/ambiguous/ambig.inp @@ -0,0 +1 @@ +i+i+i \ No newline at end of file diff --git a/tests/ambiguous/ambig.ixml b/tests/ambiguous/ambig.ixml new file mode 100644 index 00000000..71085799 --- /dev/null +++ b/tests/ambiguous/ambig.ixml @@ -0,0 +1,3 @@ +expr: e. +e: e, "+", e; e, "-", e; "i". + diff --git a/tests/ambiguous/ambig.output.xml b/tests/ambiguous/ambig.output.xml new file mode 100644 index 00000000..1639e399 --- /dev/null +++ b/tests/ambiguous/ambig.output.xml @@ -0,0 +1,15 @@ + + +i+i+i diff --git a/tests/ambiguous/ambig2.alt.output.xml b/tests/ambiguous/ambig2.alt.output.xml new file mode 100644 index 00000000..2304ab88 --- /dev/null +++ b/tests/ambiguous/ambig2.alt.output.xml @@ -0,0 +1,7 @@ + + diff --git a/tests/ambiguous/ambig2.inp b/tests/ambiguous/ambig2.inp new file mode 100644 index 00000000..e69de29b diff --git a/tests/ambiguous/ambig2.ixml b/tests/ambiguous/ambig2.ixml new file mode 100644 index 00000000..07a22f2a --- /dev/null +++ b/tests/ambiguous/ambig2.ixml @@ -0,0 +1 @@ +a: "a"* ; "b"*. 
diff --git a/tests/ambiguous/ambig2.output.xml b/tests/ambiguous/ambig2.output.xml new file mode 100644 index 00000000..566095da --- /dev/null +++ b/tests/ambiguous/ambig2.output.xml @@ -0,0 +1,7 @@ + + + diff --git a/tests/ambiguous/ambig3.alt-2.output.xml b/tests/ambiguous/ambig3.alt-2.output.xml new file mode 100644 index 00000000..2fa28db9 --- /dev/null +++ b/tests/ambiguous/ambig3.alt-2.output.xml @@ -0,0 +1,12 @@ + +a b diff --git a/tests/ambiguous/ambig3.alt-3.output.xml b/tests/ambiguous/ambig3.alt-3.output.xml new file mode 100644 index 00000000..5cee17c8 --- /dev/null +++ b/tests/ambiguous/ambig3.alt-3.output.xml @@ -0,0 +1,12 @@ + +a b diff --git a/tests/ambiguous/ambig3.alt.output.xml b/tests/ambiguous/ambig3.alt.output.xml new file mode 100644 index 00000000..5f56c4f2 --- /dev/null +++ b/tests/ambiguous/ambig3.alt.output.xml @@ -0,0 +1,12 @@ + +a b diff --git a/tests/ambiguous/ambig3.inp b/tests/ambiguous/ambig3.inp new file mode 100644 index 00000000..7d36a38e --- /dev/null +++ b/tests/ambiguous/ambig3.inp @@ -0,0 +1 @@ +a b \ No newline at end of file diff --git a/tests/ambiguous/ambig3.ixml b/tests/ambiguous/ambig3.ixml new file mode 100644 index 00000000..2318c34c --- /dev/null +++ b/tests/ambiguous/ambig3.ixml @@ -0,0 +1,3 @@ +a: "a", spaces, b. +b: spaces, "b". +spaces: " "*. 
diff --git a/tests/ambiguous/ambig3.output.xml b/tests/ambiguous/ambig3.output.xml new file mode 100644 index 00000000..6edb2aad --- /dev/null +++ b/tests/ambiguous/ambig3.output.xml @@ -0,0 +1,12 @@ + +a b diff --git a/tests/ambiguous/ambig4.alt.output.xml b/tests/ambiguous/ambig4.alt.output.xml new file mode 100644 index 00000000..33b1fc74 --- /dev/null +++ b/tests/ambiguous/ambig4.alt.output.xml @@ -0,0 +1,9 @@ +red=#f00 +green=#0f0 + diff --git a/tests/ambiguous/ambig4.inp b/tests/ambiguous/ambig4.inp new file mode 100644 index 00000000..2f715528 --- /dev/null +++ b/tests/ambiguous/ambig4.inp @@ -0,0 +1,2 @@ +red=#f00 +green=#0f0 diff --git a/tests/ambiguous/ambig4.ixml b/tests/ambiguous/ambig4.ixml new file mode 100644 index 00000000..0d7a38f3 --- /dev/null +++ b/tests/ambiguous/ambig4.ixml @@ -0,0 +1,5 @@ +properties: property+. +property: name, "=", value+",", eol. +name: [L]+. +value: ~[","]+. +eol: -#a. diff --git a/tests/ambiguous/ambig4.output.xml b/tests/ambiguous/ambig4.output.xml new file mode 100644 index 00000000..413d8c2c --- /dev/null +++ b/tests/ambiguous/ambig4.output.xml @@ -0,0 +1,17 @@ + +red=#f00green=#0f0 diff --git a/tests/ambiguous/ambig5.alt.output.xml b/tests/ambiguous/ambig5.alt.output.xml new file mode 100644 index 00000000..55275b60 --- /dev/null +++ b/tests/ambiguous/ambig5.alt.output.xml @@ -0,0 +1,8 @@ + +12 diff --git a/tests/ambiguous/ambig5.inp b/tests/ambiguous/ambig5.inp new file mode 100644 index 00000000..3cacc0b9 --- /dev/null +++ b/tests/ambiguous/ambig5.inp @@ -0,0 +1 @@ +12 \ No newline at end of file diff --git a/tests/ambiguous/ambig5.ixml b/tests/ambiguous/ambig5.ixml new file mode 100644 index 00000000..fd589d5d --- /dev/null +++ b/tests/ambiguous/ambig5.ixml @@ -0,0 +1,3 @@ +data: number+. +number: digit+. +digit: ["0"-"9"]. 
diff --git a/tests/ambiguous/ambig5.output.xml b/tests/ambiguous/ambig5.output.xml new file mode 100644 index 00000000..d5c84c58 --- /dev/null +++ b/tests/ambiguous/ambig5.output.xml @@ -0,0 +1,13 @@ + +12 diff --git a/tests/ambiguous/ambig6.alt.output.xml b/tests/ambiguous/ambig6.alt.output.xml new file mode 100644 index 00000000..f00f9b90 --- /dev/null +++ b/tests/ambiguous/ambig6.alt.output.xml @@ -0,0 +1,7 @@ + + +{} diff --git a/tests/ambiguous/ambig6.inp b/tests/ambiguous/ambig6.inp new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/tests/ambiguous/ambig6.inp @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/tests/ambiguous/ambig6.ixml b/tests/ambiguous/ambig6.ixml new file mode 100644 index 00000000..f0731182 --- /dev/null +++ b/tests/ambiguous/ambig6.ixml @@ -0,0 +1,4 @@ +block: "{", rule*";", "}". +rule: name, "=", value; . +name: [L]+. +value: [Nd]+. diff --git a/tests/ambiguous/ambig6.output.xml b/tests/ambiguous/ambig6.output.xml new file mode 100644 index 00000000..82f708f5 --- /dev/null +++ b/tests/ambiguous/ambig6.output.xml @@ -0,0 +1,7 @@ + + +{} diff --git a/tests/ambiguous/ambig7.alt.output.xml b/tests/ambiguous/ambig7.alt.output.xml new file mode 100644 index 00000000..2d976f22 --- /dev/null +++ b/tests/ambiguous/ambig7.alt.output.xml @@ -0,0 +1 @@ +bc diff --git a/tests/ambiguous/ambig7.inp b/tests/ambiguous/ambig7.inp new file mode 100644 index 00000000..e5d8f44b --- /dev/null +++ b/tests/ambiguous/ambig7.inp @@ -0,0 +1 @@ +bc \ No newline at end of file diff --git a/tests/ambiguous/ambig7.ixml b/tests/ambiguous/ambig7.ixml new file mode 100644 index 00000000..9e8392ba --- /dev/null +++ b/tests/ambiguous/ambig7.ixml @@ -0,0 +1,3 @@ +a: b, ()?, c. +b: "b". +c: "c". 
diff --git a/tests/ambiguous/ambig7.output.xml b/tests/ambiguous/ambig7.output.xml new file mode 100644 index 00000000..f0b5c009 --- /dev/null +++ b/tests/ambiguous/ambig7.output.xml @@ -0,0 +1 @@ +bc diff --git a/tests/ambiguous/css.alt.output.xml b/tests/ambiguous/css.alt.output.xml new file mode 100644 index 00000000..9ac6c874 --- /dev/null +++ b/tests/ambiguous/css.alt.output.xml @@ -0,0 +1,27 @@ +bodyhead diff --git a/tests/ambiguous/css.inp b/tests/ambiguous/css.inp new file mode 100644 index 00000000..1f2c561f --- /dev/null +++ b/tests/ambiguous/css.inp @@ -0,0 +1,2 @@ +body { color: blue; font-weight: bold; margin: 0; } +head {} diff --git a/tests/ambiguous/css.ixml b/tests/ambiguous/css.ixml new file mode 100644 index 00000000..987b8c26 --- /dev/null +++ b/tests/ambiguous/css.ixml @@ -0,0 +1,12 @@ +css: S, rule+. +rule: selector, block. +block: -"{", S, property*(-";", S), -"}", S. +property: @name, S, -":", S, value; empty. +selector: name, S. +name: letter+. +-letter: ["a"-"z"; "-"]. +digit: ["0"-"9"]. +value: (@name; @number), S. +number: digit+. +-empty: . +-S: -[" "; #a]*. diff --git a/tests/ambiguous/css.output.xml b/tests/ambiguous/css.output.xml new file mode 100644 index 00000000..f7b063d1 --- /dev/null +++ b/tests/ambiguous/css.output.xml @@ -0,0 +1,32 @@ + + +bodyhead diff --git a/tests/ambiguous/date.alt.output.xml b/tests/ambiguous/date.alt.output.xml new file mode 100644 index 00000000..bb8df532 --- /dev/null +++ b/tests/ambiguous/date.alt.output.xml @@ -0,0 +1,14 @@ + + +The conference starts Thursday, 9th Feb 2017 at 10:00. diff --git a/tests/ambiguous/date.inp b/tests/ambiguous/date.inp new file mode 100644 index 00000000..5df1d355 --- /dev/null +++ b/tests/ambiguous/date.inp @@ -0,0 +1 @@ +The conference starts Thursday, 9th Feb 2017 at 10:00. 
\ No newline at end of file diff --git a/tests/ambiguous/date.ixml b/tests/ambiguous/date.ixml new file mode 100644 index 00000000..15da79c0 --- /dev/null +++ b/tests/ambiguous/date.ixml @@ -0,0 +1,38 @@ +date: bla, day, S, number, S, month, S, year, bla; + bla, number, S, month, S, year, bla; + bla, day, ",", S, number, S, month, S, year, bla. +day: Monday; Tuesday; Wednesday; Thursday; Friday; Saturday; Sunday. +Monday: "Mon"; "Monday". +Tuesday: "Tue"; "Tuesday". +Wednesday: "Wed"; "Weds"; "Wednesday". +Thursday: "Thu"; "Thur"; "Thurs"; "Thursday". +Friday: "Fri"; "Friday". +Saturday: "Sat"; "Saturday". +Sunday: "Sun"; "Sunday". +number: cardinal; ordinal. +cardinal: digits1, "st"; digits2, "nd"; digits3, "rd"; digits4,"th". +ordinal: digits. +-digits: digit+. +-digit: ["0"-"9"]. +-digits1: "1"; digits, "1". +-digits2: "2"; digits, "2". +-digits3: "3"; digits, "3". +-digits4: digit4; digits, digit4. +-digit4: ["4"-"9"]; "0". +-bla: ; char, bla. +-char: [" "-"~"]. +month: January; February; March; April; May; June; July; August; September; October; November; December. +year: digit, digit, digit, digit. +January: "January"; "Jan". +February: "February"; "Feb". +March: "March"; "Mar". +April: "April"; "Apr". +May: "May". +June: "June"; "Jun". +July: "July"; "Jul". +August: "August"; "Aug". +September: "September"; "Sept"; "Sep". +October: "October"; "Oct". +November: "November"; "Nov". +December: "December"; "Dec". +-S: " "*. diff --git a/tests/ambiguous/date.output.xml b/tests/ambiguous/date.output.xml new file mode 100644 index 00000000..0b30edb0 --- /dev/null +++ b/tests/ambiguous/date.output.xml @@ -0,0 +1,16 @@ + + +The conference starts Thursday, 9th Feb 2017 at 10:00. 
diff --git a/tests/ambiguous/empty-parens.inp b/tests/ambiguous/empty-parens.inp new file mode 100644 index 00000000..e69de29b diff --git a/tests/ambiguous/empty-parens.ixml b/tests/ambiguous/empty-parens.ixml new file mode 100644 index 00000000..295f1c8f --- /dev/null +++ b/tests/ambiguous/empty-parens.ixml @@ -0,0 +1 @@ +a: "a"; ()?. diff --git a/tests/ambiguous/empty-parens.output.xml b/tests/ambiguous/empty-parens.output.xml new file mode 100644 index 00000000..d80a5e27 --- /dev/null +++ b/tests/ambiguous/empty-parens.output.xml @@ -0,0 +1 @@ + diff --git a/tests/ambiguous/expr0.alt.output.xml b/tests/ambiguous/expr0.alt.output.xml new file mode 100644 index 00000000..12ba0c4c --- /dev/null +++ b/tests/ambiguous/expr0.alt.output.xml @@ -0,0 +1,15 @@ + + +a÷b÷c diff --git a/tests/ambiguous/expr0.inp b/tests/ambiguous/expr0.inp new file mode 100644 index 00000000..3a2ccaf2 --- /dev/null +++ b/tests/ambiguous/expr0.inp @@ -0,0 +1 @@ +a÷b÷c \ No newline at end of file diff --git a/tests/ambiguous/expr0.ixml b/tests/ambiguous/expr0.ixml new file mode 100644 index 00000000..a6a1d7ce --- /dev/null +++ b/tests/ambiguous/expr0.ixml @@ -0,0 +1,14 @@ +{expr: operand+operator. +operand: id; number. +-id: @name. +name: letter+. +-number: @value. +value: digit+. +letter: ["a"-"z"]. +digit: ["0"-"9"]. +operator: "+"; "-"; "×"; "÷". + } +expr: id; number; expr, operator, expr. +id: ["a"-"z"]+. +number: ["0"-"9"]+. +operator: "+"; "-"; "×"; "÷". diff --git a/tests/ambiguous/expr0.output.xml b/tests/ambiguous/expr0.output.xml new file mode 100644 index 00000000..e0ede602 --- /dev/null +++ b/tests/ambiguous/expr0.output.xml @@ -0,0 +1,21 @@ + + +a÷b÷c diff --git a/tests/ambiguous/lf2.alt.output.xml b/tests/ambiguous/lf2.alt.output.xml new file mode 100644 index 00000000..404521ac --- /dev/null +++ b/tests/ambiguous/lf2.alt.output.xml @@ -0,0 +1,10 @@ + +Now is the timeFor all good peopleTo have fun. 
diff --git a/tests/ambiguous/lf2.inp b/tests/ambiguous/lf2.inp new file mode 100644 index 00000000..984acbd1 --- /dev/null +++ b/tests/ambiguous/lf2.inp @@ -0,0 +1,3 @@ +Now is the time +For all good people +To have fun. diff --git a/tests/ambiguous/lf2.ixml b/tests/ambiguous/lf2.ixml new file mode 100644 index 00000000..ed08b3ae --- /dev/null +++ b/tests/ambiguous/lf2.ixml @@ -0,0 +1,3 @@ +input: line+lf, lf?. +line: ~[#a]*. +lf: -#a. diff --git a/tests/ambiguous/lf2.output.xml b/tests/ambiguous/lf2.output.xml new file mode 100644 index 00000000..ab42c864 --- /dev/null +++ b/tests/ambiguous/lf2.output.xml @@ -0,0 +1,10 @@ + +Now is the timeFor all good peopleTo have fun. diff --git a/tests/ambiguous/test-catalog.xml b/tests/ambiguous/test-catalog.xml new file mode 100644 index 00000000..609253af --- /dev/null +++ b/tests/ambiguous/test-catalog.xml @@ -0,0 +1,271 @@ + + + +

Tests provided by Steven Pemberton in December 2021, + with corrections of 21 December. Reorganized by Norm Tovey-Walsh, February 2022.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + +

The grammar is a: "a"* ; "b"*. + and the input is the empty string.

+

If the grammar is rewritten to BNF, it + will clearly have two parse trees against + the BNF grammar, but it does not have + two parse trees against the EBNF grammar.

+

So MSM has added an alternative result + which does not mark the + result ambiguous.

+

What the spec should say will require + careful discussion.

+
+ + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

The grammar is a: b, ()?, c. b: "b". c: "c". + and the input is bc.

+

As with ambig2, this is or is not ambiguous depending on + exactly how we define ambiguity. The parser may interpret + the middle term as present or as absent, so there are two + derivations, or may be, depending on how we define + 'derivation'. There is only one parse tree for the EBNF + grammar.

+

So MSM has specified two alternative results + which differ only in including or excluding + the ambiguity flag.

+

What the spec should say will require + careful discussion.

+
+ + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/tests/correct/address.inp b/tests/correct/address.inp new file mode 100644 index 00000000..11a9245a --- /dev/null +++ b/tests/correct/address.inp @@ -0,0 +1,4 @@ +Steven Pemberton +21 Sandridge Road +St Albans AL1 4BY +United Kingdom diff --git a/tests/correct/address.ixml b/tests/correct/address.ixml new file mode 100644 index 00000000..629c4a5c --- /dev/null +++ b/tests/correct/address.ixml @@ -0,0 +1,22 @@ +address: person, lf, street, lf, postcode, city, lf, country, lf; + person, lf, street, lf, city, postcode, lf, country, lf. +person: (title, S?)?, (initials; given, S), surname, S?. +title: "Mr."; "Mrs."; "Dr."; "Ms.". +initials: initial+. +initial: LETTER, ".", S?. +surname: name. +given: name. +-name: LETTER, letters. +street: no, S?, streetname; streetname, S?, no, S?. +streetname: name, S; name, S, name. +city: name, S; name, S, name, S. +country: name, S?; name, S, name, S?. +postcode: digits, S, LETTER, LETTER, S?; + LETTER, LETTER, digits, S, digit, LETTER, LETTER, S?. +no: digits. +-LETTER: ["A"-"Z"]. +-letters: ["a"-"z"]*. +-digit: ["0"-"9"]. +-digits: ["0"-"9"]+. +-S: " "+. +-lf: -#a. diff --git a/tests/correct/address.output.xml b/tests/correct/address.output.xml new file mode 100644 index 00000000..f72dfb4f --- /dev/null +++ b/tests/correct/address.output.xml @@ -0,0 +1,14 @@ + +
Steven Pemberton21 Sandridge RoadSt Albans AL1 4BYUnited Kingdom
diff --git a/tests/correct/arith.inp b/tests/correct/arith.inp new file mode 100644 index 00000000..313bb560 --- /dev/null +++ b/tests/correct/arith.inp @@ -0,0 +1 @@ +(a+b) \ No newline at end of file diff --git a/tests/correct/arith.ixml b/tests/correct/arith.ixml new file mode 100644 index 00000000..446c4696 --- /dev/null +++ b/tests/correct/arith.ixml @@ -0,0 +1,8 @@ +expr: open, -arith, close. +open: "(". +close: ")". +arith: left, -op, right. +op: @plus. +plus: "+". +left: "a". +right:"b". diff --git a/tests/correct/arith.output.xml b/tests/correct/arith.output.xml new file mode 100644 index 00000000..1e1de874 --- /dev/null +++ b/tests/correct/arith.output.xml @@ -0,0 +1,6 @@ +(ab) diff --git a/tests/correct/attribute-value.inp b/tests/correct/attribute-value.inp new file mode 100644 index 00000000..2202320c --- /dev/null +++ b/tests/correct/attribute-value.inp @@ -0,0 +1 @@ +"'<>/&. \ No newline at end of file diff --git a/tests/correct/attribute-value.ixml b/tests/correct/attribute-value.ixml new file mode 100644 index 00000000..3ed6c51e --- /dev/null +++ b/tests/correct/attribute-value.ixml @@ -0,0 +1,2 @@ +test: a, ".". +@a: ~["."]*. diff --git a/tests/correct/attribute-value.output.xml b/tests/correct/attribute-value.output.xml new file mode 100644 index 00000000..da3111f0 --- /dev/null +++ b/tests/correct/attribute-value.output.xml @@ -0,0 +1,2 @@ + +. diff --git a/tests/correct/diary.inp b/tests/correct/diary.inp new file mode 100644 index 00000000..216fcfe1 --- /dev/null +++ b/tests/correct/diary.inp @@ -0,0 +1,9 @@ +24 December 2021 +Panic shopping! Panic packing! Will we make it before midnight? + +25 December 2021 +Food! Presents! + +26 December 2021 +Groan. + diff --git a/tests/correct/diary.ixml b/tests/correct/diary.ixml new file mode 100644 index 00000000..49c02709 --- /dev/null +++ b/tests/correct/diary.ixml @@ -0,0 +1,15 @@ +diary: entry+. +entry: date, para. +date: day, s, month, s, year, -#a. +day: digit, digit?. +-digit:["0"-"9"]. 
+month: "January"; "February"; "March"; "April"; "May"; "June"; + "July"; "August"; "September"; "October"; "November"; "December". +year: digit, digit, digit, digit. + +para: word+s, s?, blank. +-blank: -#a, -#a. +-word: (letter; punctuation)+. +-letter: [L]. +-punctuation: [".;:,'?!"]. +-s: " "+. diff --git a/tests/correct/diary.output.xml b/tests/correct/diary.output.xml new file mode 100644 index 00000000..a26812cb --- /dev/null +++ b/tests/correct/diary.output.xml @@ -0,0 +1,27 @@ + +24 December 2021Panic shopping! Panic packing! Will we make it before midnight?25 December 2021Food! Presents!26 December 2021Groan. diff --git a/tests/correct/diary2.inp b/tests/correct/diary2.inp new file mode 100644 index 00000000..216fcfe1 --- /dev/null +++ b/tests/correct/diary2.inp @@ -0,0 +1,9 @@ +24 December 2021 +Panic shopping! Panic packing! Will we make it before midnight? + +25 December 2021 +Food! Presents! + +26 December 2021 +Groan. + diff --git a/tests/correct/diary2.ixml b/tests/correct/diary2.ixml new file mode 100644 index 00000000..f1900a03 --- /dev/null +++ b/tests/correct/diary2.ixml @@ -0,0 +1,15 @@ +diary: entry+. +entry: date, para. +date: day, s, month, s, year, -#a. +day: digit, digit?. +-digit:["0"-"9"]. +month: "January"; "February"; "March"; "April"; "May"; "June"; + "July"; "August"; "September"; "October"; "November"; "December". +year: digit, digit, digit, digit. + +para: char*, blank. +-blank: -#a, -#a. +-char: letter; punctuation; s. +-letter: [L]. +-punctuation: [".;:,'?!"]. +-s: " ". diff --git a/tests/correct/diary2.output.xml b/tests/correct/diary2.output.xml new file mode 100644 index 00000000..a26812cb --- /dev/null +++ b/tests/correct/diary2.output.xml @@ -0,0 +1,27 @@ + +24 December 2021Panic shopping! Panic packing! Will we make it before midnight?25 December 2021Food! Presents!26 December 2021Groan. 
diff --git a/tests/correct/diary3.inp b/tests/correct/diary3.inp new file mode 100644 index 00000000..216fcfe1 --- /dev/null +++ b/tests/correct/diary3.inp @@ -0,0 +1,9 @@ +24 December 2021 +Panic shopping! Panic packing! Will we make it before midnight? + +25 December 2021 +Food! Presents! + +26 December 2021 +Groan. + diff --git a/tests/correct/diary3.ixml b/tests/correct/diary3.ixml new file mode 100644 index 00000000..cc1f24f6 --- /dev/null +++ b/tests/correct/diary3.ixml @@ -0,0 +1,13 @@ +diary: entry+. +entry: date, para. +date: day, s, month, s, year, -#a. +-s: -" "+. +day: digit, digit?. +-digit:["0"-"9"]. +month: "January"; "February"; "March"; "April"; "May"; "June"; + "July"; "August"; "September"; "October"; "November"; "December". +year: digit, digit, digit, digit. + +para: char*, blank. +-blank: -#a, -#a. +-char: ~[#a]. diff --git a/tests/correct/diary3.output.xml b/tests/correct/diary3.output.xml new file mode 100644 index 00000000..283cf807 --- /dev/null +++ b/tests/correct/diary3.output.xml @@ -0,0 +1,26 @@ +24December2021Panic shopping! Panic packing! Will we make it before midnight?25December2021Food! Presents!26December2021Groan. diff --git a/tests/correct/element-content.inp b/tests/correct/element-content.inp new file mode 100644 index 00000000..2202320c --- /dev/null +++ b/tests/correct/element-content.inp @@ -0,0 +1 @@ +"'<>/&. \ No newline at end of file diff --git a/tests/correct/element-content.ixml b/tests/correct/element-content.ixml new file mode 100644 index 00000000..5b0fa6c1 --- /dev/null +++ b/tests/correct/element-content.ixml @@ -0,0 +1,2 @@ +content: -a, -".". +a: ~["."]*. 
diff --git a/tests/correct/element-content.output.xml b/tests/correct/element-content.output.xml new file mode 100644 index 00000000..bb02c891 --- /dev/null +++ b/tests/correct/element-content.output.xml @@ -0,0 +1 @@ +"'<>/& diff --git a/tests/correct/email.inp b/tests/correct/email.inp new file mode 100644 index 00000000..0ec46380 --- /dev/null +++ b/tests/correct/email.inp @@ -0,0 +1 @@ +~my_mail+{nospam}$?@sub-domain.example.info \ No newline at end of file diff --git a/tests/correct/email.ixml b/tests/correct/email.ixml new file mode 100644 index 00000000..9d48c04a --- /dev/null +++ b/tests/correct/email.ixml @@ -0,0 +1,8 @@ +email: user, -"@", host. +@user: atom+".". +-atom: char+. +@host: domain+".". +-domain: word+"-". +-word: letgit+. +-letgit: ["A"-"Z"; "a"-"z"; "0"-"9"]. +-char: letgit; ["!#$%&'*+-/=?^_`{|}~"]. diff --git a/tests/correct/email.output.xml b/tests/correct/email.output.xml new file mode 100644 index 00000000..39124898 --- /dev/null +++ b/tests/correct/email.output.xml @@ -0,0 +1,2 @@ + + diff --git a/tests/correct/empty-group.inp b/tests/correct/empty-group.inp new file mode 100644 index 00000000..e5d8f44b --- /dev/null +++ b/tests/correct/empty-group.inp @@ -0,0 +1 @@ +bc \ No newline at end of file diff --git a/tests/correct/empty-group.ixml b/tests/correct/empty-group.ixml new file mode 100644 index 00000000..76f9bd31 --- /dev/null +++ b/tests/correct/empty-group.ixml @@ -0,0 +1,3 @@ +a: b, (), c. +b: "b". +c: "c". diff --git a/tests/correct/empty-group.output.xml b/tests/correct/empty-group.output.xml new file mode 100644 index 00000000..a8f89c4a --- /dev/null +++ b/tests/correct/empty-group.output.xml @@ -0,0 +1,4 @@ +
bc diff --git a/tests/correct/expr.inp b/tests/correct/expr.inp new file mode 100644 index 00000000..edfacdc7 --- /dev/null +++ b/tests/correct/expr.inp @@ -0,0 +1 @@ +pi+(10×b) \ No newline at end of file diff --git a/tests/correct/expr.ixml b/tests/correct/expr.ixml new file mode 100644 index 00000000..02d952b0 --- /dev/null +++ b/tests/correct/expr.ixml @@ -0,0 +1,15 @@ +expression: expr. +-expr: term; sum; diff. +sum: expr, -"+", term. +diff: expr, "-", term. +-term: factor; prod; div. +prod: term, -"×", factor. +div: term, "÷", factor. +-factor: id; number; bracketed. +bracketed: -"(", expr, -")". +id: @name. +name: letter+. +number: @value. +value: digit+. +-letter: ["a"-"z"]. +-digit: ["0"-"9"]. diff --git a/tests/correct/expr.output.xml b/tests/correct/expr.output.xml new file mode 100644 index 00000000..e2ce0f41 --- /dev/null +++ b/tests/correct/expr.output.xml @@ -0,0 +1,12 @@ + + diff --git a/tests/correct/expr1.inp b/tests/correct/expr1.inp new file mode 100644 index 00000000..31ebae67 --- /dev/null +++ b/tests/correct/expr1.inp @@ -0,0 +1 @@ +1+2+3*4+5 \ No newline at end of file diff --git a/tests/correct/expr1.ixml b/tests/correct/expr1.ixml new file mode 100644 index 00000000..a9a6393b --- /dev/null +++ b/tests/correct/expr1.ixml @@ -0,0 +1,13 @@ +expression: expr. +-expr: term+plusop. +@plusop: "+"; "-". +term: -factor; factor, mulop, factor+mulop. +@mulop: "*"; "/". +factor: id; number; bracketed. +bracketed: -"(", expr, -")". +id: @name. +name: letter+. +number: @value. +value: digit+. +-letter: ["a"-"z"]. +-digit: ["0"-"9"]. 
diff --git a/tests/correct/expr1.output.xml b/tests/correct/expr1.output.xml new file mode 100644 index 00000000..f7fe35b9 --- /dev/null +++ b/tests/correct/expr1.output.xml @@ -0,0 +1,19 @@ + diff --git a/tests/correct/expr2.inp b/tests/correct/expr2.inp new file mode 100644 index 00000000..0cae92a7 --- /dev/null +++ b/tests/correct/expr2.inp @@ -0,0 +1 @@ +234×(bbbb+cccc+dddd)×12 \ No newline at end of file diff --git a/tests/correct/expr2.ixml b/tests/correct/expr2.ixml new file mode 100644 index 00000000..449bc670 --- /dev/null +++ b/tests/correct/expr2.ixml @@ -0,0 +1,10 @@ +expression: expr. +-expr: term; sum; diff. +sum: term, "+", term+"+". +diff: term, "-", term+"-". +-term: factor; prod; div. +prod: factor, "×", factor+"×". +div: factor, "÷", factor+"÷". +-factor: name; number; "(", ^expr, ")". +name: ["a"-"z"]+. +number: ["0"-"9"]+. diff --git a/tests/correct/expr2.output.xml b/tests/correct/expr2.output.xml new file mode 100644 index 00000000..da1aa472 --- /dev/null +++ b/tests/correct/expr2.output.xml @@ -0,0 +1,14 @@ + +234×(bbbb+cccc+dddd12 diff --git a/tests/correct/expr3.inp b/tests/correct/expr3.inp new file mode 100644 index 00000000..0cae92a7 --- /dev/null +++ b/tests/correct/expr3.inp @@ -0,0 +1 @@ +234×(bbbb+cccc+dddd)×12 \ No newline at end of file diff --git a/tests/correct/expr3.ixml b/tests/correct/expr3.ixml new file mode 100644 index 00000000..4df1d6a7 --- /dev/null +++ b/tests/correct/expr3.ixml @@ -0,0 +1,11 @@ +expression: expr. +-expr: term; sum; diff. +sum: term, "+", term+"+". +diff: term, "-", term+"-". +-term: factor; prod; div. +prod: factor, "×", factor+"×". +div: factor, "÷", factor+"÷". +-factor: name; number; bracketed. +bracketed: "(", expr, ")". +name: ["a"-"z"]+. +number: ["0"-"9"]+. 
diff --git a/tests/correct/expr3.output.xml b/tests/correct/expr3.output.xml new file mode 100644 index 00000000..b933e8a0 --- /dev/null +++ b/tests/correct/expr3.output.xml @@ -0,0 +1,13 @@ + +234×(bbbb+cccc+dddd)×12 diff --git a/tests/correct/expr4.inp b/tests/correct/expr4.inp new file mode 100644 index 00000000..1b25bc35 --- /dev/null +++ b/tests/correct/expr4.inp @@ -0,0 +1 @@ +pi+(10×a×b)+3.14 \ No newline at end of file diff --git a/tests/correct/expr4.ixml b/tests/correct/expr4.ixml new file mode 100644 index 00000000..932eeb9b --- /dev/null +++ b/tests/correct/expr4.ixml @@ -0,0 +1,8 @@ +expression: expr. +-expr: term; sum. +sum: term, "+", term+"+". +-term: factor; prod. +prod: factor, "×", factor+"×". +-factor: id; number; "(", expr, ")". +id: ["a"-"z"]+. +number: ["0"-"9"]+, (".", ["0"-"9"]+)?. diff --git a/tests/correct/expr4.output.xml b/tests/correct/expr4.output.xml new file mode 100644 index 00000000..4eae9761 --- /dev/null +++ b/tests/correct/expr4.output.xml @@ -0,0 +1,12 @@ + +pi+(10×a×b)+3.14 diff --git a/tests/correct/expr5.inp b/tests/correct/expr5.inp new file mode 100644 index 00000000..5064edce --- /dev/null +++ b/tests/correct/expr5.inp @@ -0,0 +1 @@ +(3) \ No newline at end of file diff --git a/tests/correct/expr5.ixml b/tests/correct/expr5.ixml new file mode 100644 index 00000000..b1533f7a --- /dev/null +++ b/tests/correct/expr5.ixml @@ -0,0 +1,14 @@ +expression: expr. +-expr: term; sum; diff. +sum: expr, -"+", term. +diff: expr, "-", term. +-term: factor; prod; div. +prod: term, -"×", factor. +div: term, "÷", factor. +-factor: id; number; bracketed. +bracketed: -"(", expr, -")". +id: @name. +name: letter+. +number: digit+. +-letter: ["a"-"z"]. +-digit: ["0"-"9"]. 
diff --git a/tests/correct/expr5.output.xml b/tests/correct/expr5.output.xml new file mode 100644 index 00000000..859796e1 --- /dev/null +++ b/tests/correct/expr5.output.xml @@ -0,0 +1,6 @@ + +3 diff --git a/tests/correct/expr6.inp b/tests/correct/expr6.inp new file mode 100644 index 00000000..c5ecd84b --- /dev/null +++ b/tests/correct/expr6.inp @@ -0,0 +1 @@ +a^2+b×3+c^2 \ No newline at end of file diff --git a/tests/correct/expr6.ixml b/tests/correct/expr6.ixml new file mode 100644 index 00000000..3bc9fcb2 --- /dev/null +++ b/tests/correct/expr6.ixml @@ -0,0 +1,15 @@ +expression: expr. +-expr: term; sum; diff. +sum: term, "+", term+"+". +diff: term, "-", term+"-". +-term: power; prod; div. +prod: power, "×", power+"×". +div: power, "÷", power+"÷". +-power: fact; exp. +exp: fact, "^", fact+"^". +-fact: id; number; bracketed. +bracketed: "(", exp, ")". +id: letter+. +number: digit+. +-letter: ["a"-"z"]. +-digit: ['0'-'9']. diff --git a/tests/correct/expr6.output.xml b/tests/correct/expr6.output.xml new file mode 100644 index 00000000..ccc890c7 --- /dev/null +++ b/tests/correct/expr6.output.xml @@ -0,0 +1,16 @@ +a^2+b×3+c^2 diff --git a/tests/correct/hash.inp b/tests/correct/hash.inp new file mode 100644 index 00000000..150182ab --- /dev/null +++ b/tests/correct/hash.inp @@ -0,0 +1 @@ +#1 #12 #123 #1234 #12345 #123456 #1. \ No newline at end of file diff --git a/tests/correct/hash.ixml b/tests/correct/hash.ixml new file mode 100644 index 00000000..3a057ae0 --- /dev/null +++ b/tests/correct/hash.ixml @@ -0,0 +1,5 @@ +hashes: hash*S, ".". +hash: "#", d6. +@d6: d, (d, (d, (d, (d, d?)?)?)?)?. +-d: ["0"-"9"]. +-S: " "+. diff --git a/tests/correct/hash.output.xml b/tests/correct/hash.output.xml new file mode 100644 index 00000000..9f57f7e3 --- /dev/null +++ b/tests/correct/hash.output.xml @@ -0,0 +1,8 @@ +# # # # # # #. 
diff --git a/tests/correct/hex.inp b/tests/correct/hex.inp new file mode 100644 index 00000000..9eb1507c --- /dev/null +++ b/tests/correct/hex.inp @@ -0,0 +1 @@ +a b \ No newline at end of file diff --git a/tests/correct/hex.ixml b/tests/correct/hex.ixml new file mode 100644 index 00000000..a89f2e15 --- /dev/null +++ b/tests/correct/hex.ixml @@ -0,0 +1 @@ +hex: "a", [#20], "b". diff --git a/tests/correct/hex.output.xml b/tests/correct/hex.output.xml new file mode 100644 index 00000000..08165f2e --- /dev/null +++ b/tests/correct/hex.output.xml @@ -0,0 +1,2 @@ + +a b diff --git a/tests/correct/hex1.inp b/tests/correct/hex1.inp new file mode 100644 index 00000000..9eb1507c --- /dev/null +++ b/tests/correct/hex1.inp @@ -0,0 +1 @@ +a b \ No newline at end of file diff --git a/tests/correct/hex1.ixml b/tests/correct/hex1.ixml new file mode 100644 index 00000000..dc0ea049 --- /dev/null +++ b/tests/correct/hex1.ixml @@ -0,0 +1 @@ +hex: "a", #20, "b". diff --git a/tests/correct/hex1.output.xml b/tests/correct/hex1.output.xml new file mode 100644 index 00000000..08165f2e --- /dev/null +++ b/tests/correct/hex1.output.xml @@ -0,0 +1,2 @@ + +a b diff --git a/tests/correct/hex3.inp b/tests/correct/hex3.inp new file mode 100644 index 00000000..e8a71bef --- /dev/null +++ b/tests/correct/hex3.inp @@ -0,0 +1 @@ +a!b \ No newline at end of file diff --git a/tests/correct/hex3.ixml b/tests/correct/hex3.ixml new file mode 100644 index 00000000..847fbd36 --- /dev/null +++ b/tests/correct/hex3.ixml @@ -0,0 +1 @@ +hex: "a", [#1-#7e], "b". 
diff --git a/tests/correct/hex3.output.xml b/tests/correct/hex3.output.xml new file mode 100644 index 00000000..11e6b6f7 --- /dev/null +++ b/tests/correct/hex3.output.xml @@ -0,0 +1,2 @@ + +a!b diff --git a/tests/correct/json.inp b/tests/correct/json.inp new file mode 100644 index 00000000..696a9676 --- /dev/null +++ b/tests/correct/json.inp @@ -0,0 +1,18 @@ +{"menu": { + "id": "file", + "value": "File", + "popup": { + "menuitem": [ + {"value": "New", "onclick": "CreateNewDoc()"}, + {"value": "Open", "onclick": "OpenDoc()"}, + {"value": "Close", "onclick": "CloseDoc()"} + ] + } + }, + "number": -0.0e+00, + "string": "\uffff", + "bool": true, + "also": false, + "no": null +} + diff --git a/tests/correct/json.ixml b/tests/correct/json.ixml new file mode 100644 index 00000000..a30ab870 --- /dev/null +++ b/tests/correct/json.ixml @@ -0,0 +1,29 @@ +json: element. +element: value. +-value: string; + number; + object; + array; + "true", S; + "false", S; + "null", S. +object: "{", S, members, "}", S. +-members: member*(",", S). +member: @string, S, ":", S, element. +array: "[", S, elements, "]", S. +elements: element*(",", S). +string: -'"', character*, -'"'. +-character: ~['"\'; #0-#19]; + "\", escape. +escape: ['"\/bfnrt']; + "u", hex, hex, hex, hex. +hex: digit; ["A"-"F"; "a"-"f"]. +number: int, frac, exp. +int: "-"?, digit; + "-"?, onenine, digit*. +digit: ["0"-"9"]. +onenine: ["1"-"9"]. +frac: (".", digit+)?. +exp: (["eE"], sign, digit+)?. +sign: ["+-"]?. +-S: -[#9; #a; #d; " "]*. 
diff --git a/tests/correct/json.output.xml b/tests/correct/json.output.xml new file mode 100644 index 00000000..923b9fe8 --- /dev/null +++ b/tests/correct/json.output.xml @@ -0,0 +1,108 @@ + +{:{:file,:File,:{:[{:New,:CreateNewDoc()},{:Open,:OpenDoc()},{:Close,:CloseDoc()}]}},:-0.0e+00,:\uffff,:true,:false,:null} diff --git a/tests/correct/json1.inp b/tests/correct/json1.inp new file mode 100644 index 00000000..c4d32ef5 --- /dev/null +++ b/tests/correct/json1.inp @@ -0,0 +1 @@ +{"name": "pi", "value": 3.145926} \ No newline at end of file diff --git a/tests/correct/json1.ixml b/tests/correct/json1.ixml new file mode 100644 index 00000000..190e3e97 --- /dev/null +++ b/tests/correct/json1.ixml @@ -0,0 +1,19 @@ +json: S, object. +object: "{", S, members, "}", S. +-members: pair*(",", S). +pair: @string, S, ":", S, value. +array: "[", S, value*(",", S), "]", S. +-value: string, S; number, S; object; array; "true", S; "false", S; "null", S. +string: -"""", char*, -"""". +-char: ~['"'; "\" {;[#0-#1F];}]; '\', ('"'; "\"; "/"; "b"; "f"; "n"; "r"; "t"; "u", hexdigits). +number: "-"?, int, frac?, exp?. +-int: "0"; digit19, digit*. +-frac: ".", digit+. +-exp: ("e"; "E"), sign?, digit+. +-sign: "+"; "-". +-S: " "*. +-digit: ["0"-"9"]. +-digit19: ["1"-"9"]. +-hexdigits: hexdigit, hexdigit, hexdigit, hexdigit. +-hexdigit: digit; ["a"-"f"]; ["A"-"F"]. + diff --git a/tests/correct/json1.output.xml b/tests/correct/json1.output.xml new file mode 100644 index 00000000..b62df49c --- /dev/null +++ b/tests/correct/json1.output.xml @@ -0,0 +1,10 @@ + +{: pi, : 3.145926} diff --git a/tests/correct/lf.inp b/tests/correct/lf.inp new file mode 100644 index 00000000..f4f31b7e --- /dev/null +++ b/tests/correct/lf.inp @@ -0,0 +1,3 @@ +Now is the time +For all good people +To have fun. \ No newline at end of file diff --git a/tests/correct/lf.ixml b/tests/correct/lf.ixml new file mode 100644 index 00000000..0512a073 --- /dev/null +++ b/tests/correct/lf.ixml @@ -0,0 +1,3 @@ +input: line+lf. 
+line: ~[#a]*. +lf: -#a. diff --git a/tests/correct/lf.output.xml b/tests/correct/lf.output.xml new file mode 100644 index 00000000..cc1d38ff --- /dev/null +++ b/tests/correct/lf.output.xml @@ -0,0 +1,7 @@ +Now is the timeFor all good peopleTo have fun. diff --git a/tests/correct/marked.inp b/tests/correct/marked.inp new file mode 100644 index 00000000..5931a55d --- /dev/null +++ b/tests/correct/marked.inp @@ -0,0 +1 @@ +!d \ No newline at end of file diff --git a/tests/correct/marked.ixml b/tests/correct/marked.ixml new file mode 100644 index 00000000..e97cd097 --- /dev/null +++ b/tests/correct/marked.ixml @@ -0,0 +1,3 @@ +a: @b; "!", c. +b: c. +c: "d". \ No newline at end of file diff --git a/tests/correct/marked.output.xml b/tests/correct/marked.output.xml new file mode 100644 index 00000000..0ca696f7 --- /dev/null +++ b/tests/correct/marked.output.xml @@ -0,0 +1 @@ +!d diff --git a/tests/correct/nested-comment.inp b/tests/correct/nested-comment.inp new file mode 100644 index 00000000..63d8dbd4 --- /dev/null +++ b/tests/correct/nested-comment.inp @@ -0,0 +1 @@ +b \ No newline at end of file diff --git a/tests/correct/nested-comment.ixml b/tests/correct/nested-comment.ixml new file mode 100644 index 00000000..aab5c763 --- /dev/null +++ b/tests/correct/nested-comment.ixml @@ -0,0 +1,6 @@ +a: b, c. +b: "b". {here is a comment +{with a nested comment +b: "c". +}} +c: . diff --git a/tests/correct/nested-comment.output.xml b/tests/correct/nested-comment.output.xml new file mode 100644 index 00000000..8abbc802 --- /dev/null +++ b/tests/correct/nested-comment.output.xml @@ -0,0 +1,5 @@ + +b diff --git a/tests/correct/para-test.inp b/tests/correct/para-test.inp new file mode 100644 index 00000000..e5aec0ff --- /dev/null +++ b/tests/correct/para-test.inp @@ -0,0 +1,8 @@ +AB December 2021 +Para entry. + +Here is annother section +See. + +And another. +The end. 
diff --git a/tests/correct/para-test.ixml b/tests/correct/para-test.ixml new file mode 100644 index 00000000..0838cd15 --- /dev/null +++ b/tests/correct/para-test.ixml @@ -0,0 +1,5 @@ +section: para, lf. +para: line+. +line: ~[#a]+, lf. +lf: -#a. + diff --git a/tests/correct/poly.inp b/tests/correct/poly.inp new file mode 100644 index 00000000..a2f46941 --- /dev/null +++ b/tests/correct/poly.inp @@ -0,0 +1 @@ +44x⁷⁸⁹+13x²-20x+1 \ No newline at end of file diff --git a/tests/correct/poly.ixml b/tests/correct/poly.ixml new file mode 100644 index 00000000..fb57f6ce --- /dev/null +++ b/tests/correct/poly.ixml @@ -0,0 +1,9 @@ +polynomial: f, (plus; minus)*. {4x2+3x-2} +plus: -"+", -f. +minus: -"-", -f. +@power: exp. +@constant: @n. +-f: @n, -"x", @power?; constant. +n: ["0"-"9"]+. +exp: ("⁰"; "¹"; "²";"³"; "⁴"; "⁵"; "⁶"; "⁷"; "⁸"; "⁹")+. + diff --git a/tests/correct/poly.output.xml b/tests/correct/poly.output.xml new file mode 100644 index 00000000..691eaf86 --- /dev/null +++ b/tests/correct/poly.output.xml @@ -0,0 +1,6 @@ + + diff --git a/tests/correct/program.inp b/tests/correct/program.inp new file mode 100644 index 00000000..6908c458 --- /dev/null +++ b/tests/correct/program.inp @@ -0,0 +1 @@ +{a=0;f(a, 0);} \ No newline at end of file diff --git a/tests/correct/program.ixml b/tests/correct/program.ixml new file mode 100644 index 00000000..5c9f47f6 --- /dev/null +++ b/tests/correct/program.ixml @@ -0,0 +1,17 @@ +program: block. +block: "{", S, statement*(";", S), "}", S. +statement: if-statement; while-statement; assignment; call; block; . +if-statement: "if", S, condition, "then", S, statement, else-part?. +else-part: "else", S, statement. +while-statement: "while", S, condition, "do", S, statement. +assignment: variable, "=", S, expression. +variable: identifier. +call: identifier, "(", S, parameter*(",", S), ")", S. +parameter: -expression. +identifier: letter+, S. +expression: identifier; number. +number: digit+, S. +-letter: ["a"-"z"]; ["A"-"Z"]. 
+-digit: ["0"-"9"]. +condition: identifier. +-S: " "*. diff --git a/tests/correct/program.output.xml b/tests/correct/program.output.xml new file mode 100644 index 00000000..493aefa4 --- /dev/null +++ b/tests/correct/program.output.xml @@ -0,0 +1,25 @@ + +{a=0;f(a, 0);} diff --git a/tests/correct/range-comments.inp b/tests/correct/range-comments.inp new file mode 100644 index 00000000..934edc81 --- /dev/null +++ b/tests/correct/range-comments.inp @@ -0,0 +1 @@ +name \ No newline at end of file diff --git a/tests/correct/range-comments.ixml b/tests/correct/range-comments.ixml new file mode 100644 index 00000000..3ccff1e1 --- /dev/null +++ b/tests/correct/range-comments.ixml @@ -0,0 +1,2 @@ +name: letter*. +letter: [{comment}"a"{comment}-{comment}"z"{comment}]. diff --git a/tests/correct/range-comments.output.xml b/tests/correct/range-comments.output.xml new file mode 100644 index 00000000..1974d0f3 --- /dev/null +++ b/tests/correct/range-comments.output.xml @@ -0,0 +1,6 @@ +name diff --git a/tests/correct/range.inp b/tests/correct/range.inp new file mode 100644 index 00000000..1d470f6d --- /dev/null +++ b/tests/correct/range.inp @@ -0,0 +1 @@ +5 . diff --git a/tests/correct/range.ixml b/tests/correct/range.ixml new file mode 100644 index 00000000..aafbb68e --- /dev/null +++ b/tests/correct/range.ixml @@ -0,0 +1,3 @@ +data: range1, range2, -".". +range1: ["0"-"9"]. +range2: [#0-#9]. 
diff --git a/tests/correct/range.output.xml b/tests/correct/range.output.xml new file mode 100644 index 00000000..6d4e4039 --- /dev/null +++ b/tests/correct/range.output.xml @@ -0,0 +1,4 @@ +5 diff --git a/tests/correct/ranges.inp b/tests/correct/ranges.inp new file mode 100644 index 00000000..0ace0494 --- /dev/null +++ b/tests/correct/ranges.inp @@ -0,0 +1 @@ + !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ diff --git a/tests/correct/ranges.ixml b/tests/correct/ranges.ixml new file mode 100644 index 00000000..9d61f0ce --- /dev/null +++ b/tests/correct/ranges.ixml @@ -0,0 +1,15 @@ +test: other, digit, other, cap, other, lc, other, lf?. +other: ~[#a; #30-#39; #41-#5A; #61-#7A]+. +cap: [#41-#5A]+. +lc: [#61-#7A]+. +digit: [#30-#39]+. +-lf: -#a. +{ +0123456789abcdef + !"#$%&'()*+,-./ +0123456789:;<=>? +@ABCDEFGHIJKLMNO +PQRSTUVWXYZ[\]^_ +`abcdefghijklmno +pqrstuvwxyz{|}~ +} \ No newline at end of file diff --git a/tests/correct/ranges.output.xml b/tests/correct/ranges.output.xml new file mode 100644 index 00000000..4d653246 --- /dev/null +++ b/tests/correct/ranges.output.xml @@ -0,0 +1,9 @@ + !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ diff --git a/tests/correct/ranges1.inp b/tests/correct/ranges1.inp new file mode 100644 index 00000000..effeefbc --- /dev/null +++ b/tests/correct/ranges1.inp @@ -0,0 +1 @@ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ \ No newline at end of file diff --git a/tests/correct/ranges1.ixml b/tests/correct/ranges1.ixml new file mode 100644 index 00000000..5fc72892 --- /dev/null +++ b/tests/correct/ranges1.ixml @@ -0,0 +1,20 @@ +chars: punctuation, nonpunct?; upper, nonupper?; lower, nonlower?; digit, nondigit?. +-nonpunct: upper, nonupper?; lower, nonlower?; digit, nondigit?. +-nonupper: punctuation, nonpunct?; lower, nonlower?; digit, nondigit?. 
+-nonlower: punctuation, nonpunct?; upper, nonupper?; digit, nondigit?. +-nondigit: punctuation, nonpunct?; upper, nonupper?; lower, nonlower?. + +upper: [#41-#5A]+. +lower: [#61-#7A]+. +digit: [#30-#39]+. +punctuation: ~[#a; #30-#39; #41-#5A; #61-#7A]+. +{-lf: -#a.} +{ +0123456789abcdef + !"#$%&'()*+,-./ +0123456789:;<=>? +@ABCDEFGHIJKLMNO +PQRSTUVWXYZ[\]^_ +`abcdefghijklmno +pqrstuvwxyz{|}~ +} \ No newline at end of file diff --git a/tests/correct/ranges1.output.xml b/tests/correct/ranges1.output.xml new file mode 100644 index 00000000..d6c8ee7e --- /dev/null +++ b/tests/correct/ranges1.output.xml @@ -0,0 +1,9 @@ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ diff --git a/tests/correct/string.inp b/tests/correct/string.inp new file mode 100644 index 00000000..1c50dcea --- /dev/null +++ b/tests/correct/string.inp @@ -0,0 +1 @@ +aaa.bbb.ccc. \ No newline at end of file diff --git a/tests/correct/string.ixml b/tests/correct/string.ixml new file mode 100644 index 00000000..8dc014e5 --- /dev/null +++ b/tests/correct/string.ixml @@ -0,0 +1,5 @@ +S : @able, baker, @charlie. +able: string. +baker: string. +charlie: string. +string: ["abc"]*, ".". diff --git a/tests/correct/string.output.xml b/tests/correct/string.output.xml new file mode 100644 index 00000000..f38d4450 --- /dev/null +++ b/tests/correct/string.output.xml @@ -0,0 +1,6 @@ + +bbb. diff --git a/tests/correct/tab.inp b/tests/correct/tab.inp new file mode 100644 index 00000000..5dc8616e --- /dev/null +++ b/tests/correct/tab.inp @@ -0,0 +1 @@ + abc \ No newline at end of file diff --git a/tests/correct/tab.ixml b/tests/correct/tab.ixml new file mode 100644 index 00000000..48149f1f --- /dev/null +++ b/tests/correct/tab.ixml @@ -0,0 +1,3 @@ +data: s?, word. +-s: [#20; #9]+. +word: [L]+. 
diff --git a/tests/correct/tab.output.xml b/tests/correct/tab.output.xml new file mode 100644 index 00000000..bbf7eb95 --- /dev/null +++ b/tests/correct/tab.output.xml @@ -0,0 +1,4 @@ + + abc diff --git a/tests/correct/test-catalog.xml b/tests/correct/test-catalog.xml new file mode 100644 index 00000000..ca0b864c --- /dev/null +++ b/tests/correct/test-catalog.xml @@ -0,0 +1,756 @@ + + + +

Tests provided by Steven Pemberton in December 2021, + with corrections of 21 December. Reorganized by Norm Tovey-Walsh, February 2022.

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

We may need a new assertion: if taken literally this + grammar produces non-XML output (multiple occurrences of an + attribute on the same element). +

+

For now I'll class it as a run-time error in the + grammar. But is it an error in the grammar?

+

To be discussed, probably at length.

+
+ + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

The grammar as written allows only one section; the input + has several. The error was probably in the formulation of + the grammar, but the simplest change to bring it into a + consistent state is to change the result.

+

For the record, the originally asserted result is now in + file 'para-test.disputed-output.xml'.

+
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Possible problem: range [#0 - #9] + includes non-XML characters. Legal?

+
+ + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

The input is a[.!=''] but the grammar does + not provide for predicates. So although the input is a + perfectly fine XPath expression, it is not a sentence in the + language defined by the grammar specified for the test.

+
+ + + + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ diff --git a/tests/correct/test.inp b/tests/correct/test.inp new file mode 100644 index 00000000..a96aa0ea --- /dev/null +++ b/tests/correct/test.inp @@ -0,0 +1 @@ +.. \ No newline at end of file diff --git a/tests/correct/test.ixml b/tests/correct/test.ixml new file mode 100644 index 00000000..f05624d9 --- /dev/null +++ b/tests/correct/test.ixml @@ -0,0 +1,4 @@ +test: foo, bar. +foo: -".". +bar: ".". + diff --git a/tests/correct/test.output.xml b/tests/correct/test.output.xml new file mode 100644 index 00000000..b2b1a7e6 --- /dev/null +++ b/tests/correct/test.output.xml @@ -0,0 +1,5 @@ + +. diff --git a/tests/correct/unicode-range.inp b/tests/correct/unicode-range.inp new file mode 100644 index 00000000..094793f6 --- /dev/null +++ b/tests/correct/unicode-range.inp @@ -0,0 +1 @@ +ŤĤıŞıŞÀŤėšť \ No newline at end of file diff --git a/tests/correct/unicode-range.ixml b/tests/correct/unicode-range.ixml new file mode 100644 index 00000000..c5546dd9 --- /dev/null +++ b/tests/correct/unicode-range.ixml @@ -0,0 +1 @@ +latin: ["À"-"ž"]+. diff --git a/tests/correct/unicode-range.output.xml b/tests/correct/unicode-range.output.xml new file mode 100644 index 00000000..bb5b2a3b --- /dev/null +++ b/tests/correct/unicode-range.output.xml @@ -0,0 +1 @@ +ŤĤıŞıŞÀŤėšť diff --git a/tests/correct/unicode-range1.inp b/tests/correct/unicode-range1.inp new file mode 100644 index 00000000..89266a2b --- /dev/null +++ b/tests/correct/unicode-range1.inp @@ -0,0 +1 @@ +¡¢£¤¥¦§¨©«¬®¯°±²³´µ¶·¸¹»¼½¾¿×÷ \ No newline at end of file diff --git a/tests/correct/unicode-range1.ixml b/tests/correct/unicode-range1.ixml new file mode 100644 index 00000000..fc7f47c2 --- /dev/null +++ b/tests/correct/unicode-range1.ixml @@ -0,0 +1 @@ +chars: [#1-"÷"]+. 
diff --git a/tests/correct/unicode-range1.output.xml b/tests/correct/unicode-range1.output.xml new file mode 100644 index 00000000..7f2277e0 --- /dev/null +++ b/tests/correct/unicode-range1.output.xml @@ -0,0 +1 @@ +¡¢£¤¥¦§¨©«¬®¯°±²³´µ¶·¸¹»¼½¾¿×÷ diff --git a/tests/correct/unicode-range2.inp b/tests/correct/unicode-range2.inp new file mode 100644 index 00000000..f2ba8f84 --- /dev/null +++ b/tests/correct/unicode-range2.inp @@ -0,0 +1 @@ +abc \ No newline at end of file diff --git a/tests/correct/unicode-range2.ixml b/tests/correct/unicode-range2.ixml new file mode 100644 index 00000000..a5a5b9d3 --- /dev/null +++ b/tests/correct/unicode-range2.ixml @@ -0,0 +1 @@ +chars: [#60-#70]+. diff --git a/tests/correct/unicode-range2.output.xml b/tests/correct/unicode-range2.output.xml new file mode 100644 index 00000000..d3fc11f9 --- /dev/null +++ b/tests/correct/unicode-range2.output.xml @@ -0,0 +1 @@ +abc diff --git a/tests/correct/vcard.inp b/tests/correct/vcard.inp new file mode 100644 index 00000000..746ca436 --- /dev/null +++ b/tests/correct/vcard.inp @@ -0,0 +1,10 @@ +BEGIN:VCARD +VERSION:3.0 +N:Lastname;Surname +FN:Displayname +ORG:EVenX +URL:http://www.evenx.com/ +EMAIL:info@evenx.com +TEL;TYPE=voice,work,pref:+49 1234 56788 +ADR;TYPE=intl,work,postal,parcel:;;Wallstr. 1;Berlin;;12345;Germany +END:VCARD diff --git a/tests/correct/vcard.ixml b/tests/correct/vcard.ixml new file mode 100644 index 00000000..3e7f10fe --- /dev/null +++ b/tests/correct/vcard.ixml @@ -0,0 +1,11 @@ +card: -"BEGIN:", name, eoln, property+, -"END:", endname, eoln. +property: name, parameters, -":", attribute+-";", -eoln. +parameters: (-";", parameter)*. +parameter: name, -"=", pvalue. +@pvalue: ~[";:"; #a]+. +attribute: value. +@value: achar*. +-achar: ~["#;"; #a]. +@name: ["a"-"z"; "A"-"Z"]+. +@endname: name. +-eoln: -#a. 
diff --git a/tests/correct/vcard.output.xml b/tests/correct/vcard.output.xml new file mode 100644 index 00000000..da216fb8 --- /dev/null +++ b/tests/correct/vcard.output.xml @@ -0,0 +1,45 @@ + diff --git a/tests/correct/xml.inp b/tests/correct/xml.inp new file mode 100644 index 00000000..15fb92b1 --- /dev/null +++ b/tests/correct/xml.inp @@ -0,0 +1,3 @@ + + Now is the time for stuff. + \ No newline at end of file diff --git a/tests/correct/xml.ixml b/tests/correct/xml.ixml new file mode 100644 index 00000000..8a33aa9c --- /dev/null +++ b/tests/correct/xml.ixml @@ -0,0 +1,11 @@ +xml: element. +element: -"<", name, (" "+, attribute)*, (-">", content, -""; -"/>"). +@name: ["a"-"z"; "A"-"Z"]+. +@close: name. +attribute: name, -"=", value. +@value: -'"', dchar*, -'"'; -"'", schar*, -"'". +content: (cchar; element)*. +-dchar: ~['"']. +-schar: ~["'"]. +-cchar: ~["<"]. + diff --git a/tests/correct/xml.output.xml b/tests/correct/xml.output.xml new file mode 100644 index 00000000..3dcca91c --- /dev/null +++ b/tests/correct/xml.output.xml @@ -0,0 +1,12 @@ + + + Now is the time for stuff. + diff --git a/tests/correct/xml1.inp b/tests/correct/xml1.inp new file mode 100644 index 00000000..360c58cf --- /dev/null +++ b/tests/correct/xml1.inp @@ -0,0 +1,3 @@ + + This is a test. + \ No newline at end of file diff --git a/tests/correct/xml1.ixml b/tests/correct/xml1.ixml new file mode 100644 index 00000000..5fae05e3 --- /dev/null +++ b/tests/correct/xml1.ixml @@ -0,0 +1,10 @@ +xml: element. +element: -"<", name, (-" "+, attribute)*, (-">", content, -""; -"/>"). +@name: ["a"-"z"; "A"-"Z"]+. +@close: name. +attribute: name, -"=", value. +@value: -'"', dchar*, -'"'; -"'", schar*, -"'". +content: (cchar; element)*. +-dchar: ~['"'; "<"]. +-schar: ~["'"; "<"]. +-cchar: ~["<"]. diff --git a/tests/correct/xml1.output.xml b/tests/correct/xml1.output.xml new file mode 100644 index 00000000..a9dd29aa --- /dev/null +++ b/tests/correct/xml1.output.xml @@ -0,0 +1,12 @@ + + + This is a test. 
+ diff --git a/tests/correct/xpath.inp b/tests/correct/xpath.inp new file mode 100644 index 00000000..6217304f --- /dev/null +++ b/tests/correct/xpath.inp @@ -0,0 +1 @@ +a[.!=''] \ No newline at end of file diff --git a/tests/correct/xpath.ixml b/tests/correct/xpath.ixml new file mode 100644 index 00000000..c68e21e7 --- /dev/null +++ b/tests/correct/xpath.ixml @@ -0,0 +1,260 @@ +XPath: Expr. + +ParamList: Param, ( ',', Param )*. +Param: '$', EQName, TypeDeclaration?. +FunctionBody: EnclosedExpr. +EnclosedExpr: '{', Expr?, '}'. + +Expr: ExprSingle, ( s?,',',s?, ExprSingle )*. + +-ExprSingle: OrExpr; OrExprSingle. + + +-OrExprSingle: AndExpr. +OrExpr: AndExpr, (s?, 'or', s?, AndExpr )+. +AndExpr: ComparisonExpr, (s?, 'and', s?, ComparisonExpr )*. +ComparisonExpr: StringConcatExpr, (s, ( @ValueComp ; @GeneralComp ;@ NodeComp ), s, StringConcatExpr )?. +StringConcatExpr: RangeExpr, (s?, '||', s?, RangeExpr )*. +RangeExpr: AdditiveExpr, ( s, 'to', s, AdditiveExpr )?. +AdditiveExpr: MultiplicativeExpr; + MultiplicativeExpr, s?, @AddOp, s?, AdditiveExpr. +@AddOp: ( '+' ; '-' ). +MultiplicativeExpr: UnionExpr; + UnionExpr, s?, @MultOp, s?, MultiplicativeExpr. +@MultOp: '*' ; 'div' ; 'idiv' ; 'mod'. +UnionExpr: IntersectExceptExpr, ( ( 'union' ; '|' ), IntersectExceptExpr )*. +IntersectExceptExpr: InstanceofExpr, ( s, ( 'intersect' ;'except' ), s, InstanceofExpr )*. +InstanceofExpr: TreatExpr, ( s, 'instance', s, 'of', s, @SequenceType )?. +TreatExpr: CastExpr, ( s, -'treat', s, -'as', s, @SequenceType )?. +CastExpr: ArrowExpr, ( s, -'cast', s,-'as', s, @SingleType )?. +ArrowExpr: UnaryExpr, ( '=>', ArrowFunctionSpecifier, ArgumentList )*. + + +UnaryExpr: ( '-' ; '+' )*, ValueExpr. +-ValueExpr: SimpleMapExpr. +GeneralComp: '='; '!='; '<'; '<='; '>'; '>='. +ValueComp: 'eq'; 'ne'; 'lt'; 'le'; 'gt'; 'ge'. +NodeComp: 'is'; '<<'; '>>'. +SimpleMapExpr: PathExpr, ( '!', PathExpr )*. +PathExpr: '/', ( RelativePathExpr ); + '//', RelativePathExpr; + RelativePathExpr. 
+RelativePathExpr: StepExpr, ( ( '/'; '//' ), StepExpr )*. + +-StepExpr: PostfixExpr; AxisStep. +-AxisStep: ( ReverseStep;ForwardStep ). +ForwardStep: @ForwardAxis, NodeTest; + AbbrevForwardStep. +ForwardAxis: 'child::'; + 'descendant::'; + 'attribute::'; + 'self::'; + 'descendant-or-self::'; + 'following-sibling::'; + 'following::'; + 'namespace::'. +AbbrevForwardStep: '@'?, QName. +ReverseStep: @ReverseAxis, NodeTest; + @AbbrevReverseStep. +ReverseAxis: 'parent::'; + 'ancestor::'; + 'preceding-sibling::'; + 'preceding::'; + 'ancestor-or-self::'. +AbbrevReverseStep: '..'. + + +-NodeTest: KindTest; NameTest. +-NameTest: EQName; Wildcard. + +PostfixExpr: PrimaryExpr. +ArgumentList: -'(',s?, ( -Argument,(s?, -',', s?, -Argument )* )?, s?, -')'. + +KeySpecifier: NCName;| IntegerLiteral; ParenthesizedExpr; '*'. +-ArrowFunctionSpecifier: EQName; VarRef; ParenthesizedExpr. + +-PrimaryExpr: Literal; + VarRef; + ParenthesizedExpr; + ContextItemExpr; + FunctionCall; + FunctionItemExpr; + MapConstructor; + ArrayConstructor; + UnaryLookup. +-Literal: NumericLiteral; StringLiteral. +-NumericLiteral: IntegerLiteral; DecimalLiteral;| DoubleLiteral. +VarRef: -'$', @VarName. +VarName: EQName. +ParenthesizedExpr: -'(', Expr?, -')'. +ContextItemExpr: -'.'. +FunctionCall: @FunctionEQName, -ArgumentList. +-Argument: ExprSingle; ArgumentPlaceholder. +ArgumentPlaceholder: -'?'. +FunctionItemExpr: NamedFunctionRef; InlineFunctionExpr. +NamedFunctionRef: FunctionEQName, '#', IntegerLiteral. +InlineFunctionExpr: -'function', -'(', ParamList?, ')', ( s, 'as', s, SequenceType )?, FunctionBody. +MapConstructor: -'map', -'{', ( MapConstructorEntry, ( ',', MapConstructorEntry )* )?, -'}'. +MapConstructorEntry: MapKeyExpr, -':', MapValueExpr. +MapKeyExpr: ExprSingle. +MapValueExpr: ExprSingle. +ArrayConstructor: SquareArrayConstructor; CurlyArrayConstructor. +SquareArrayConstructor: -'[', ( ExprSingle, ( -',', ExprSingle )* )?, -']'. +CurlyArrayConstructor: -'array', '{', Expr?, -'}'. 
+UnaryLookup: -'?', KeySpecifier. + +SingleType: SimpleTypeName, '?'?. +TypeDeclaration: 'as', SequenceType. +SequenceType: 'empty-sequence()'; + ItemType, OccurrenceIndicator?. +@OccurrenceIndicator: '?'; '*'; '+'. +ItemType: KindTest; + 'item()'; + FunctionTest; + MapTest; + ArrayTest; + AtomicOrUnionType; + ParenthesizedItemType. +AtomicOrUnionType: EQName. + + +-KindTest: DocumentTest; + ElementTest; + AttributeTest; + SchemaElementTest; + SchemaAttributeTest; + PITest; + CommentTest; + TextTest; + NamespaceNodeTest; + AnyKindTest. +AnyKindTest: 'node()'. +DocumentTest: -'document-node(' , ( ElementTest ; SchemaElementTest )?, -')'. +TextTest: -'text()'. +CommentTest: -'comment()'. +NamespaceNodeTest: -'namespace-node()'. +PITest: -'processing-instruction(' , ( NCName ; StringLiteral )?, -')'. +AttributeTest: -'attribute(' ,( AttribNameOrWildcard, ( ',', TypeName )? )?, -')'. +AttribNameOrWildcard: AttributeName; '*'. +SchemaAttributeTest: -'schema-attribute(', AttributeDeclaration, -')'. +AttributeDeclaration: AttributeName. +ElementTest: -'element(' , ( ElementNameOrWildcard, ( ',', TypeName, '?'? )? )?, -')'. +ElementNameOrWildcard: ElementName;'*'. +SchemaElementTest: -'schema-element(', ElementDeclaration, -')'. +ElementDeclaration: ElementName. + +AttributeName: EQName. +ElementName: EQName. +SimpleTypeName: EQName. +TypeName: EQName. + +FunctionTest: AnyFunctionTest; TypedFunctionTest. +AnyFunctionTest: -'function(*)'. +TypedFunctionTest: -'function(', ( SequenceType, ( -',', SequenceType )* )?, -')', s, 'as', s, SequenceType. +MapTest: AnyMapTest; TypedMapTest. +AnyMapTest: -'map(*)'. +TypedMapTest: -'map(', s, AtomicOrUnionType,s, -',', s, SequenceType, s, -')'. +ArrayTest: AnyArrayTest; TypedArrayTest. +AnyArrayTest: -'array(*)'. +TypedArrayTest: -'array(', s, SequenceType, s, -')'. +ParenthesizedItemType: -'(',s, ItemType,s, -')'. + +FunctionEQName: FunctionName; URIQualifiedName. +EQName: QName; URIQualifiedName. 
+ +QName: FunctionName; + 'array'; + 'attribute'; + 'comment'; + 'document-node'; + 'element'; + 'empty-sequence'; + 'function'; + 'if'; + 'item'; + 'map'; + 'namespace-node'; + 'node'; + 'processing-instruction'; + 'schema-attribute'; + 'schema-element'; + 'switch'; + 'text'; + 'typeswitch'. + + -FunctionName: QNameToken; + 'ancestor'; + 'ancestor-or-self'; + 'and'; + 'cast'; + 'castable'; + 'child'; + 'descendant'; + 'descendant-or-self'; + 'div'; + 'else'; + 'eq'; + 'every'; + 'except'; + 'following'; + 'following-sibling'; + 'for'; + 'ge'; + 'gt'; + 'idiv'; + 'instance'; + 'intersect'; + 'is'; + 'le'; + 'let'; + 'lt'; + 'mod'; + 'namespace'; + 'ne'; + 'or'; + 'parent'; + 'preceding'; + 'preceding-sibling'; + 'return'; + 'satisfies'; + 'self'; + 'some'; + 'to'; + 'treat'; + 'union'. + +StringLiteral: -'"', ( EscapeQuot; ~['"'] )*, -'"'; + -"'", ( EscapeApos ; ~["'"] )*, -"'". +IntegerLiteral: -Digits. +DecimalLiteral: '.', -Digits; + -Digits, '.', ['0'-'9']*. +DoubleLiteral: ( '.', -Digits ; -Digits, ( '.', ['0'-'9']* )? ), ['e'; 'E'], -Digits. + +-URIQualifiedName: BracedURILiteral, NCName. +BracedURILiteral: 'Q', '{', ~['{';'}']*, '}'. +EscapeQuot: '""'. +EscapeApos: "''". + +QNameToken: PrefixedName; UnprefixedName. +PrefixedName: @Prefix, -':', @LocalPart. +UnprefixedName: LocalPart. +Prefix: NCName. +LocalPart: NCName. +NCName: @Name. + +-NameStartChar: ['A'-'Z']; + '_'; + ['a'-'z']. +-NameChar: NameStartChar; + '-'; + '.'; + ['0'-'9']. +Name: NameStartChar, NameChar*. +-s: -' '+. +Wildcard: '*'; + NCName, ':', '*'; + '*', ':', NCName; + BracedURILiteral, '*'. +Digits: ['0'-'9']+. + + diff --git a/tests/error/test-catalog.xml b/tests/error/test-catalog.xml new file mode 100644 index 00000000..baaabd6e --- /dev/null +++ b/tests/error/test-catalog.xml @@ -0,0 +1,99 @@ + + + +

Tests intended to demonstrate errors that processors are required + to raise.

+
+ + + + + + +

The number is so large it’s likely to exceed the processor’s + ability to represent it.

+
+ s: #decafbadbadbadbad . + + + +
+ + + + +

Exceeds the range of Unicode.

+
+ s: #ffffffff0 . + + + +
+ + + + +

Not a Unicode character.

+
+ s: #fffe . + + + +
+ + + + +

Also not a Unicode character.

+
+ s: #1fffe . + + + +
+ + + + +

A Unicode surrogate.

+
+ s: #d801 . + + + +
+ + + + +

An invalid Unicode character class.

+
+ s: [Xq] . + + + +
+ + + + +

An invalid range.

+
+ s: ['Z'-'A'] . + + + +
+ +
diff --git a/tests/extra/ixml-one-line.output.xml b/tests/extra/ixml-one-line.output.xml new file mode 100644 index 00000000..28abf334 --- /dev/null +++ b/tests/extra/ixml-one-line.output.xml @@ -0,0 +1,495 @@ +all characters except line breaks; quotes must be doubledall characters except line breaks; quotes must be doubled diff --git a/tests/extra/para-test.disputed-output.xml b/tests/extra/para-test.disputed-output.xml new file mode 100644 index 00000000..97c6f058 --- /dev/null +++ b/tests/extra/para-test.disputed-output.xml @@ -0,0 +1,12 @@ + +
+ + AB December 2021 + + + Para entry. + + + + +
diff --git a/tests/extra/url1.disputed-output.xml b/tests/extra/url1.disputed-output.xml new file mode 100644 index 00000000..4d552c3c --- /dev/null +++ b/tests/extra/url1.disputed-output.xml @@ -0,0 +1,5 @@ + +:// + www.w3.org + /TR/1999/xhtml.html + diff --git a/tests/extra/xml.dup.output.xml b/tests/extra/xml.dup.output.xml new file mode 100644 index 00000000..94a5a97e --- /dev/null +++ b/tests/extra/xml.dup.output.xml @@ -0,0 +1,11 @@ + + Now is the time for stuff. diff --git a/tests/extra/xpath.output.xml b/tests/extra/xpath.output.xml new file mode 100644 index 00000000..917392e2 --- /dev/null +++ b/tests/extra/xpath.output.xml @@ -0,0 +1,50 @@ + + diff --git a/tests/ixml/bnf.inp b/tests/ixml/bnf.inp new file mode 100644 index 00000000..8a968d5d --- /dev/null +++ b/tests/ixml/bnf.inp @@ -0,0 +1,24 @@ +::= +-::= | +::= "::=" | + "::=" +::= +-::= | "|" +::= | +-::= | +::= +::= | | | +@::= "<" ">" +@::= "@" | "^" | "-" +::= +::= ["a"-"z"] | ["A"-"Z"] | ["0"-"9"] +::= | "-" | +@::= """" """" +::= | +::= [" "-"!"] | ["#"-"~"] | """""" {all characters, quotes must be doubled} +::= "[" "-" "]" +-::= """" """" | """" """" """" """" +-::= " " | | +::= "{" "}" +-::= | +-::= [" "-"|"] | "~" {Everything except: } diff --git a/tests/ixml/bnf.ixml b/tests/ixml/bnf.ixml new file mode 100644 index 00000000..effb2bf0 --- /dev/null +++ b/tests/ixml/bnf.ixml @@ -0,0 +1,26 @@ +ixml: S, rules. +-rules: rule; rule, rules. +rule: mark, name, S, -"::=", S, def; + name, S, -"::=", S, def. +def: alts. +-alts: alt; alt, -"|", S, alts. +alt: terms; empty. +-terms: term; term, terms. +empty: . +term: mark, name, S; name, S; string, S; range. +@name: -"<", letters, -">". +@mark: "@", S; "^", S; "-", S. +letters: letter, more-letters. +letter: ["a"-"z"]; ["A"-"Z"]; ["0"-"9"]. +more-letters: letter, more-letters; "-", more-letters; . +@string: -"""", chars, -"""". +chars: char, chars; char. +char: [" "-"!"]; ["#"-"~"]; -'"', '"'. 
{all characters, quotes must be doubled} +range: -"[", S, from, S, -"-", S, to, S, -"]", S. +@from: character. +@to: character. +-character: -"""", char, -""""; -"""", """", -"""", -"""". +-S: -[" "; #a], S; comment, S; . +comment: "{", schars, "}". +-schars: schar, schars; . +-schar: [" "-"|"]; "~". {Everything except: } diff --git a/tests/ixml/bnf.output.xml b/tests/ixml/bnf.output.xml new file mode 100644 index 00000000..36c6847e --- /dev/null +++ b/tests/ixml/bnf.output.xml @@ -0,0 +1,293 @@ + +{all characters, quotes must be doubled}{Everything except: } diff --git a/tests/ixml/ixml-no-spaces.inp b/tests/ixml/ixml-no-spaces.inp new file mode 100644 index 00000000..9d94beee --- /dev/null +++ b/tests/ixml/ixml-no-spaces.inp @@ -0,0 +1,60 @@ +ixml:S,rule+S,S. +-S:(whitespace;comment)*. +-whitespace:-[Zs];tab;{lf;}cr. +-tab:-#9. +{-lf:-#a.} +-cr:-#d. +comment:-"{",(cchar;comment)*,-"}". +-cchar:~["{}"]. +rule:(mark,S)?,name,S,["=:"],S,-alts,".". +@mark:["@^-"]. +alts:alt+([";|"],S). +alt:term*(",",S). +-term:factor; +option; +repeat0; +repeat1. +-factor:terminal; +nonterminal; +"(",S,alts,")",S. +repeat0:factor,"*",S,sep?. +repeat1:factor,"+",S,sep?. +option:factor,"?",S. +sep:factor. +nonterminal:(mark,S)?,name,S. +-terminal:literal; +charset. +literal:quoted; +encoded. +-quoted:(tmark,S)?,-string. +@name:namestart,namefollower*. +-namestart:["_";Ll;Lu;Lm;Lt;Lo]. +-namefollower:namestart;["-.·‿⁀";Nd;Mn]. +@tmark:["^-"]. +string:-'"',dstring,-'"',S; +-"'",sstring,-"'",S. +@dstring:dchar+. +@sstring:schar+. +dchar:~['"']; +'"',-'"'.{allcharacters,quotesmustbedoubled} +schar:~["'"]; +"'",-"'".{allcharacters,quotesmustbedoubled} +-encoded:(tmark,S)?,-"#",@hex,S. +hex:["0"-"9";"a"-"f";"A"-"F"]+. +-charset:inclusion; +exclusion. +inclusion:(tmark,S)?,set. +exclusion:(tmark,S)?,"~",S,set. +-set:"[",S,member*([";|"],S),"]",S. +-member:literal; +range; +class. +range:from,"-",S,to. +@from:character. +@to:character. 
+-character:-'"',dchar,-'"',S; +-"'",schar,-"'",S; +"#",hex,S. +class:@code,S. +code:letter,letter. +-letter:["a"-"z";"A"-"Z"]. diff --git a/tests/ixml/ixml-no-spaces.ixml b/tests/ixml/ixml-no-spaces.ixml new file mode 100644 index 00000000..a8722e00 --- /dev/null +++ b/tests/ixml/ixml-no-spaces.ixml @@ -0,0 +1,60 @@ +ixml:S,rule+S,S. +-S:(whitespace;comment)*. +-whitespace:-[Zs];tab;lf;cr. +-tab:-#9. +-lf:-#a. +-cr:-#d. +comment:-"{",(cchar;comment)*,-"}". +-cchar:~["{}"]. +rule:(mark,S)?,name,S,["=:"],S,-alts,".". +@mark:["@^-"]. +alts:alt+([";|"],S). +alt:term*(",",S). +-term:factor; +option; +repeat0; +repeat1. +-factor:terminal; +nonterminal; +"(",S,alts,")",S. +repeat0:factor,"*",S,sep?. +repeat1:factor,"+",S,sep?. +option:factor,"?",S. +sep:factor. +nonterminal:(mark,S)?,name,S. +-terminal:literal; +charset. +literal:quoted; +encoded. +-quoted:(tmark,S)?,-string. +@name:namestart,namefollower*. +-namestart:["_";Ll;Lu;Lm;Lt;Lo]. +-namefollower:namestart;["-.·‿⁀";Nd;Mn]. +@tmark:["^-"]. +string:-'"',dstring,-'"',S; +-"'",sstring,-"'",S. +@dstring:dchar+. +@sstring:schar+. +dchar:~['"']; +'"',-'"'.{allcharacters,quotesmustbedoubled} +schar:~["'"]; +"'",-"'".{allcharacters,quotesmustbedoubled} +-encoded:(tmark,S)?,-"#",@hex,S. +hex:["0"-"9";"a"-"f";"A"-"F"]+. +-charset:inclusion; +exclusion. +inclusion:(tmark,S)?,set. +exclusion:(tmark,S)?,"~",S,set. +-set:"[",S,member*([";|"],S),"]",S. +-member:literal; +range; +class. +range:from,"-",S,to. +@from:character. +@to:character. +-character:-'"',dchar,-'"',S; +-"'",schar,-"'",S; +"#",hex,S. +class:@code,S. +code:letter,letter. +-letter:["a"-"z";"A"-"Z"]. 
diff --git a/tests/ixml/ixml-no-spaces.output.xml b/tests/ixml/ixml-no-spaces.output.xml new file mode 100644 index 00000000..e6c916e6 --- /dev/null +++ b/tests/ixml/ixml-no-spaces.output.xml @@ -0,0 +1,407 @@ +:,+,.:(;)*.:[];;lf;.:.-lf:-#a.:.:,(;)*,.:~[].:,,,[],,,.:[].:+([],).:*(,).:;;;.:;;,,,,.:,,,.:,,,.:,,.:.:,,.:;.:;.:,.:,*.:[;;;;;].:;[;;].:[].:,,,;,,,.:+.:+.:~[];,.allcharacters,quotesmustbedoubled:~[];,.allcharacters,quotesmustbedoubled:,,,.:[-;-;-]+.:;.:,.:,,,.:,,*([],),,.:;;.:,,,.:.:.:,,,;,,,;,,.:,.:,.:[-;-]. diff --git a/tests/ixml/ixml-one-line.corr.output.xml b/tests/ixml/ixml-one-line.corr.output.xml new file mode 100644 index 00000000..9ee8cb76 --- /dev/null +++ b/tests/ixml/ixml-one-line.corr.output.xml @@ -0,0 +1,49 @@ +:,+,.:(;)*.:[];;;.:.:.:.:,(;)*,.:~[].:,,,[],,,.:[].:+([],).:*(,).:;;;.:;;,,,,.:,,,.:,,,.:,,.:.:,,.:;.:;.:,.:,*.:[;;;;;].:;[;;].:[].:,,,;,,,.:+.:+.:~[];,.allcharacters,quotesmustbedoubled:~[];,.allcharacters,quotesmustbedoubled:,,,.:[-;-;-]+.:;.:,.:,,,.:,,*([],),,.:;;.:,,,.:.:.:,,,;,,,;,,.:,.:,.:[-;-]. 
diff --git a/tests/ixml/ixml-one-line.inp b/tests/ixml/ixml-one-line.inp new file mode 100644 index 00000000..093fda44 --- /dev/null +++ b/tests/ixml/ixml-one-line.inp @@ -0,0 +1 @@ +ixml:S,rule+S,S.-S:(whitespace;comment)*.-whitespace:-[Zs];tab;lf;cr.-tab:-#9.-lf:-#a.-cr:-#d.comment:-"{",(cchar;comment)*,-"}".-cchar:~["{}"].rule:(mark,S)?,name,S,["=:"],S,-alts,".".@mark:["@^-"].alts:alt+([";|"],S).alt:term*(",",S).-term:factor;option;repeat0;repeat1.-factor:terminal;nonterminal;"(",S,alts,")",S.repeat0:factor,"*",S,sep?.repeat1:factor,"+",S,sep?.option:factor,"?",S.sep:factor.nonterminal:(mark,S)?,name,S.-terminal:literal;charset.literal:quoted;encoded.-quoted:(tmark,S)?,-string.@name:namestart,namefollower*.-namestart:["_";Ll;Lu;Lm;Lt;Lo].-namefollower:namestart;["-.·‿⁀";Nd;Mn].@tmark:["^-"].string:-'"',dstring,-'"',S;-"'",sstring,-"'",S.@dstring:dchar+.@sstring:schar+.dchar:~['"'];'"',-'"'.{allcharacters,quotesmustbedoubled}schar:~["'"];"'",-"'".{allcharacters,quotesmustbedoubled}-encoded:(tmark,S)?,-"#",@hex,S.hex:["0"-"9";"a"-"f";"A"-"F"]+.-charset:inclusion;exclusion.inclusion:(tmark,S)?,set.exclusion:(tmark,S)?,"~",S,set.-set:"[",S,member*([";|"],S),"]",S.-member:literal;range;class.range:from,"-",S,to.@from:character.@to:character.-character:-'"',dchar,-'"',S;-"'",schar,-"'",S;"#",hex,S.class:@code,S.code:letter,letter.-letter:["a"-"z";"A"-"Z"]. 
\ No newline at end of file diff --git a/tests/ixml/ixml-one-line.ixml b/tests/ixml/ixml-one-line.ixml new file mode 100644 index 00000000..093fda44 --- /dev/null +++ b/tests/ixml/ixml-one-line.ixml @@ -0,0 +1 @@ +ixml:S,rule+S,S.-S:(whitespace;comment)*.-whitespace:-[Zs];tab;lf;cr.-tab:-#9.-lf:-#a.-cr:-#d.comment:-"{",(cchar;comment)*,-"}".-cchar:~["{}"].rule:(mark,S)?,name,S,["=:"],S,-alts,".".@mark:["@^-"].alts:alt+([";|"],S).alt:term*(",",S).-term:factor;option;repeat0;repeat1.-factor:terminal;nonterminal;"(",S,alts,")",S.repeat0:factor,"*",S,sep?.repeat1:factor,"+",S,sep?.option:factor,"?",S.sep:factor.nonterminal:(mark,S)?,name,S.-terminal:literal;charset.literal:quoted;encoded.-quoted:(tmark,S)?,-string.@name:namestart,namefollower*.-namestart:["_";Ll;Lu;Lm;Lt;Lo].-namefollower:namestart;["-.·‿⁀";Nd;Mn].@tmark:["^-"].string:-'"',dstring,-'"',S;-"'",sstring,-"'",S.@dstring:dchar+.@sstring:schar+.dchar:~['"'];'"',-'"'.{allcharacters,quotesmustbedoubled}schar:~["'"];"'",-"'".{allcharacters,quotesmustbedoubled}-encoded:(tmark,S)?,-"#",@hex,S.hex:["0"-"9";"a"-"f";"A"-"F"]+.-charset:inclusion;exclusion.inclusion:(tmark,S)?,set.exclusion:(tmark,S)?,"~",S,set.-set:"[",S,member*([";|"],S),"]",S.-member:literal;range;class.range:from,"-",S,to.@from:character.@to:character.-character:-'"',dchar,-'"',S;-"'",schar,-"'",S;"#",hex,S.class:@code,S.code:letter,letter.-letter:["a"-"z";"A"-"Z"]. \ No newline at end of file diff --git a/tests/ixml/ixml-spaces.inp b/tests/ixml/ixml-spaces.inp new file mode 100644 index 00000000..7c0ead13 --- /dev/null +++ b/tests/ixml/ixml-spaces.inp @@ -0,0 +1,60 @@ + ixml : S , rule + S , S . + - S : ( whitespace ; comment ) * . + - whitespace : - [ Zs ] ; tab ; { lf ; } cr . + - tab : - #9 . +{ - lf : - #a . } + - cr : - #d . + comment : - "{" , ( cchar ; comment ) * , - "}" . + - cchar : ~ [ "{}" ] . + rule : ( mark , S ) ? , name , S , [ "=:" ] , S , - alts , "." . + @ mark : [ "@^-" ] . + alts : alt + ( [ ";|" ] , S ) . 
+ alt : term * ( "," , S ) . + - term : factor ; + option ; + repeat0 ; + repeat1 . + - factor : terminal ; + nonterminal ; + "(" , S , alts , ")" , S . + repeat0 : factor , "*" , S , sep ? . + repeat1 : factor , "+" , S , sep ? . + option : factor , "?" , S . + sep : factor . + nonterminal : ( mark , S ) ? , name , S . + - terminal : literal ; + charset . + literal : quoted ; + encoded . + - quoted : ( tmark , S ) ? , - string . + @ name : namestart , namefollower * . + - namestart : [ "_" ; Ll ; Lu ; Lm ; Lt ; Lo ] . + - namefollower : namestart ; [ "-.·‿⁀" ; Nd ; Mn ] . + @ tmark : [ "^-" ] . + string : - '"' , dstring , - '"' , S ; + - "'" , sstring , - "'" , S . + @ dstring : dchar + . + @ sstring : schar + . + dchar : ~ [ '"' ] ; + '"' , - '"' . { all characters , quotes must be doubled } + schar : ~ [ "'" ] ; + "'" , - "'" . { all characters , quotes must be doubled } + - encoded : ( tmark , S ) ? , - "#" , @ hex , S . + hex : [ "0" - "9" ; "a" - "f" ; "A" - "F" ] + . + - charset : inclusion ; + exclusion . + inclusion : ( tmark , S ) ? , set . + exclusion : ( tmark , S ) ? , "~" , S , set . + - set : "[" , S , member * ( [ ";|" ] , S ) , "]" , S . + - member : literal ; + range ; + class . + range : from , "-" , S , to . + @ from : character . + @ to : character . + - character : - '"' , dchar , - '"' , S ; + - "'" , schar , - "'" , S ; + "#" , hex , S . + class : @ code , S . + code : letter , letter . + - letter : [ "a" - "z" ; "A" - "Z" ] . diff --git a/tests/ixml/ixml-spaces.ixml b/tests/ixml/ixml-spaces.ixml new file mode 100644 index 00000000..9ee14e37 --- /dev/null +++ b/tests/ixml/ixml-spaces.ixml @@ -0,0 +1,60 @@ + ixml : S , rule + S , S . + - S : ( whitespace ; comment ) * . + - whitespace : - [ Zs ] ; tab ; lf ; cr . + - tab : - #9 . + - lf : - #a . + - cr : - #d . + comment : - "{" , ( cchar ; comment ) * , - "}" . + - cchar : ~ [ "{}" ] . + rule : ( mark , S ) ? , name , S , [ "=:" ] , S , - alts , "." . + @ mark : [ "@^-" ] . 
+ alts : alt + ( [ ";|" ] , S ) . + alt : term * ( "," , S ) . + - term : factor ; + option ; + repeat0 ; + repeat1 . + - factor : terminal ; + nonterminal ; + "(" , S , alts , ")" , S . + repeat0 : factor , "*" , S , sep ? . + repeat1 : factor , "+" , S , sep ? . + option : factor , "?" , S . + sep : factor . + nonterminal : ( mark , S ) ? , name , S . + - terminal : literal ; + charset . + literal : quoted ; + encoded . + - quoted : ( tmark , S ) ? , - string . + @ name : namestart , namefollower * . + - namestart : [ "_" ; Ll ; Lu ; Lm ; Lt ; Lo ] . + - namefollower : namestart ; [ "-.·‿⁀" ; Nd ; Mn ] . + @ tmark : [ "^-" ] . + string : - '"' , dstring , - '"' , S ; + - "'" , sstring , - "'" , S . + @ dstring : dchar + . + @ sstring : schar + . + dchar : ~ [ '"' ] ; + '"' , - '"' . { all characters , quotes must be doubled } + schar : ~ [ "'" ] ; + "'" , - "'" . { all characters , quotes must be doubled } + - encoded : ( tmark , S ) ? , - "#" , @ hex , S . + hex : [ "0" - "9" ; "a" - "f" ; "A" - "F" ] + . + - charset : inclusion ; + exclusion . + inclusion : ( tmark , S ) ? , set . + exclusion : ( tmark , S ) ? , "~" , S , set . + - set : "[" , S , member * ( [ ";|" ] , S ) , "]" , S . + - member : literal ; + range ; + class . + range : from , "-" , S , to . + @ from : character . + @ to : character . + - character : - '"' , dchar , - '"' , S ; + - "'" , schar , - "'" , S ; + "#" , hex , S . + class : @ code , S . + code : letter , letter . + - letter : [ "a" - "z" ; "A" - "Z" ] . diff --git a/tests/ixml/ixml-spaces.output.xml b/tests/ixml/ixml-spaces.output.xml new file mode 100644 index 00000000..30149b70 --- /dev/null +++ b/tests/ixml/ixml-spaces.output.xml @@ -0,0 +1,407 @@ +:,+,.:(;)*.:[];; lf ; .:. - lf : - #a . :.:,(;)*,.:~[].:,,,[],,,.:[].:+([],).:*(,).:;;;.:;;,,,,.:,,,.:,,,.:,,.:.:,,.:;.:;.:,.:,*.:[;;;;;].:;[;;].:[].:,,,;,,,.:+.:+.:~[];,. all characters , quotes must be doubled :~[];,. 
all characters , quotes must be doubled :,,,.:[-;-;-]+.:;.:,.:,,,.:,,*([],),,.:;;.:,,,.:.:.:,,,;,,,;,,.:,.:,.:[-;-]. diff --git a/tests/ixml/ixml.inp b/tests/ixml/ixml.inp new file mode 100644 index 00000000..f2be592b --- /dev/null +++ b/tests/ixml/ixml.inp @@ -0,0 +1,67 @@ + ixml: s, rule+s, s. + + -s: (whitespace; comment)*. + -whitespace: -[Zs]; tab; lf; cr. + -tab: -#9. + -lf: -#a. + -cr: -#d. + comment: -"{", (cchar; comment)*, -"}". + -cchar: ~["{}"]. + + rule: (mark, s)?, name, s, -["=:"], s, -alts, -".". + @mark: ["@^-"]. + alts: alt+(-[";|"], s). + alt: term*(-",", s). + -term: factor; + option; + repeat0; + repeat1. + -factor: terminal; + nonterminal; + -"(", s, alts, -")", s. + repeat0: factor, -"*", s, sep?. + repeat1: factor, -"+", s, sep?. + option: factor, -"?", s. + sep: factor. + nonterminal: (mark, s)?, name, s. + + -terminal: literal; + charset. + literal: quoted; + encoded. + -quoted: (tmark, s)?, -string. + + @name: namestart, namefollower*. + -namestart: ["_"; L]. +-namefollower: namestart; ["-.·‿⁀"; Nd; Mn]. + + @tmark: ["^-"]. + string: -'"', dstring, -'"', s; + -"'", sstring, -"'", s. + @dstring: dchar+. + @sstring: schar+. + dchar: ~['"'; #a; #d]; + '"', -'"'. {all characters except line breaks; quotes must be doubled} + schar: ~["'"; #a; #d]; + "'", -"'". {all characters except line breaks; quotes must be doubled} + -encoded: (tmark, s)?, -"#", @hex, s. + hex: ["0"-"9"; "a"-"f"; "A"-"F"]+. + + -charset: inclusion; + exclusion. + inclusion: (tmark, s)?, set. + exclusion: (tmark, s)?, -"~", s, set. + -set: -"[", s, member*(-[";|"], s), -"]", s. + -member: literal; + range; + class. + range: from, -"-", s, to. + @from: character. + @to: character. + -character: -'"', dchar, -'"', s; + -"'", schar, -"'", s; + "#", hex, s. + class: code, s. + @code: capital, letter?. + -capital: ["A"-"Z"]. + -letter: ["a"-"z"]. 
diff --git a/tests/ixml/ixml.ixml b/tests/ixml/ixml.ixml new file mode 100644 index 00000000..f2be592b --- /dev/null +++ b/tests/ixml/ixml.ixml @@ -0,0 +1,67 @@ + ixml: s, rule+s, s. + + -s: (whitespace; comment)*. + -whitespace: -[Zs]; tab; lf; cr. + -tab: -#9. + -lf: -#a. + -cr: -#d. + comment: -"{", (cchar; comment)*, -"}". + -cchar: ~["{}"]. + + rule: (mark, s)?, name, s, -["=:"], s, -alts, -".". + @mark: ["@^-"]. + alts: alt+(-[";|"], s). + alt: term*(-",", s). + -term: factor; + option; + repeat0; + repeat1. + -factor: terminal; + nonterminal; + -"(", s, alts, -")", s. + repeat0: factor, -"*", s, sep?. + repeat1: factor, -"+", s, sep?. + option: factor, -"?", s. + sep: factor. + nonterminal: (mark, s)?, name, s. + + -terminal: literal; + charset. + literal: quoted; + encoded. + -quoted: (tmark, s)?, -string. + + @name: namestart, namefollower*. + -namestart: ["_"; L]. +-namefollower: namestart; ["-.·‿⁀"; Nd; Mn]. + + @tmark: ["^-"]. + string: -'"', dstring, -'"', s; + -"'", sstring, -"'", s. + @dstring: dchar+. + @sstring: schar+. + dchar: ~['"'; #a; #d]; + '"', -'"'. {all characters except line breaks; quotes must be doubled} + schar: ~["'"; #a; #d]; + "'", -"'". {all characters except line breaks; quotes must be doubled} + -encoded: (tmark, s)?, -"#", @hex, s. + hex: ["0"-"9"; "a"-"f"; "A"-"F"]+. + + -charset: inclusion; + exclusion. + inclusion: (tmark, s)?, set. + exclusion: (tmark, s)?, -"~", s, set. + -set: -"[", s, member*(-[";|"], s), -"]", s. + -member: literal; + range; + class. + range: from, -"-", s, to. + @from: character. + @to: character. + -character: -'"', dchar, -'"', s; + -"'", schar, -"'", s; + "#", hex, s. + class: code, s. + @code: capital, letter?. + -capital: ["A"-"Z"]. + -letter: ["a"-"z"]. 
diff --git a/tests/ixml/ixml.output.xml b/tests/ixml/ixml.output.xml new file mode 100644 index 00000000..28abf334 --- /dev/null +++ b/tests/ixml/ixml.output.xml @@ -0,0 +1,495 @@ +all characters except line breaks; quotes must be doubledall characters except line breaks; quotes must be doubled diff --git a/tests/ixml/ixml1.inp b/tests/ixml/ixml1.inp new file mode 100644 index 00000000..88865983 --- /dev/null +++ b/tests/ixml/ixml1.inp @@ -0,0 +1,45 @@ +ixml: S, rule+. +rule: mark?, name, S, ":", S, def, ".", S. +def: alt+(";", S). +alt: term*(",", S). +-term: factor; repeat0; repeat1; option. +repeat0: factor, "*", S, sep?. +sep: factor. +repeat1: factor, "+", S, sep?. +option: factor, "?", S. +-factor: nonterminal; terminal; "(", S, def, ")", S. +nonterminal: mark?, name, S. +terminal: mark?, (quoted; hex; charset; exclude). + +charset: "[", S, element+(";", S), "]", S. +exclude: "~", S, -charset. + +-element: range; character, S; class, S. +range: from, S, "-", S, to, S. +@from: character. +@to: character. +class: letter, letter. {One of the Unicode character classes} + +@name: letgit, xletter*. +-letgit: letter; digit. +-letter: ["a"-"z"; "A"-"Z"]. +-digit: ["0"-"9"]. +-xletter: letgit; "-". + +@mark: "@", S; "^", S; "-", S. + +quoted: -'"', dstring, -'"', S; -"'", sstring, -"'", S. +@dstring: dchar+. +@sstring: schar+. +dchar: ~['"']; '""'. {all characters, dquotes must be doubled} +schar: ~["'"]; "''". {all characters, squotes must be doubled} +-character: '"', dchar, '"'; "'", schar, "'"; hex. + +hex: "#", number. +number: hexit+. +-hexit: digit; ["a"-"f"; "A"-"F"]. + +-S: (" "; comment)*. +comment: "{", cchar*, "}". +-cchar: ~["}"]. +{the end} diff --git a/tests/ixml/ixml1.ixml b/tests/ixml/ixml1.ixml new file mode 100644 index 00000000..158f6b62 --- /dev/null +++ b/tests/ixml/ixml1.ixml @@ -0,0 +1,45 @@ +ixml: S, rule+. +rule: mark?, name, S, ":", S, def, ".", S. +def: alt+(";", S). +alt: term*(",", S). +-term: factor; repeat0; repeat1; option. 
+repeat0: factor, "*", S, sep?. +sep: factor. +repeat1: factor, "+", S, sep?. +option: factor, "?", S. +-factor: nonterminal; terminal; "(", S, def, ")", S. +nonterminal: mark?, name, S. +terminal: mark?, (quoted; hex; charset; exclude). + +charset: "[", S, element+(";", S), "]", S. +exclude: "~", S, -charset. + +-element: range; character, S; class, S. +range: from, S, "-", S, to, S. +@from: character. +@to: character. +class: letter, letter. {One of the Unicode character classes} + +@name: letgit, xletter*. +-letgit: letter; digit. +-letter: ["a"-"z"; "A"-"Z"]. +-digit: ["0"-"9"]. +-xletter: letgit; "-". + +@mark: "@", S; "^", S; "-", S. + +quoted: -'"', dstring, -'"', S; -"'", sstring, -"'", S. +@dstring: dchar+. +@sstring: schar+. +dchar: ~['"']; '""'. {all characters, dquotes must be doubled} +schar: ~["'"]; "''". {all characters, squotes must be doubled} +-character: '"', dchar, '"'; "'", schar, "'"; hex. + +hex: "#", number. +number: hexit+. +-hexit: digit; ["a"-"f"; "A"-"F"]. + +-S: (" "; -#a; comment)*. +comment: "{", cchar*, "}". +-cchar: ~["}"]. +{the end} diff --git a/tests/ixml/ixml1.output.xml b/tests/ixml/ixml1.output.xml new file mode 100644 index 00000000..ee768841 --- /dev/null +++ b/tests/ixml/ixml1.output.xml @@ -0,0 +1,478 @@ +: , +.: , , , , , , , .: +(, ).: *(, ).: ; ; ; .: , , , .: .: , , , .: , , .: ; ; , , , , .: , , .: , (; ; ; ).: , , +(, ), , .: , , .: ; , ; , .: , , , , , .: .: .: , . {One of the Unicode character classes}: , *.: ; .: [-; -].: [-].: ; .: , ; , ; , .: , , , ; , , , .: +.: +.: ~['"']; . {all characters, dquotes must be doubled}: ~["'"]; . {all characters, squotes must be doubled}: , , ; , , ; .: , .: +.: ; [-; -].: (; )*.: , *, .: ~["}"].{the end} diff --git a/tests/ixml/ixml2.inp b/tests/ixml/ixml2.inp new file mode 100644 index 00000000..fd94e434 --- /dev/null +++ b/tests/ixml/ixml2.inp @@ -0,0 +1,60 @@ +ixml: S, rule+. +rule: mark?, name, ["=:"], S, -alts, ".", S. +alts: alt+([";|"], S). +alt: term*(",", S). 
+-term: factor; + repeat0; + repeat1; + option. +repeat0: factor, "*", S, sep?. +repeat1: factor, "+", S, sep?. +option: factor, "?", S. +sep: factor. + +-factor: nonterminal; + terminal; + "(", S, alts, ")", S. +nonterminal: mark?, name. +-terminal: literal; charset. +-literal: quoted; encoded. +-charset: inclusion; exclusion. + +quoted: tmark?, -string. +encoded: tmark?, @hex. +inclusion: tmark?, "[", S, element+([";|"], S), "]", S. +exclusion: tmark?, "~", S, "[", S, element+([";|"], S), "]", S. + +@tmark: ["^-"], S. +-element: range; + string; + hex; + class. +range: from, "-", S, to. +@from: character. +@to: character. +-character: -'"', dchar, -'"', S; + -"'", schar, -"'", S; + hex. +class: letter, letter, S. {One of the Unicode character classes} +-letter: ["a"-"z"; "A"-"Z"]. +@name: letgit, xletter*, S. +-letgit: ["a"-"z"; "A"-"Z"; "0"-"9"]. +-xletter: letgit; "-". + +@mark: ["@^-"], S. + +string: -'"', dstring, -'"', S; + -"'", sstring, -"'", S. +@dstring: dchar+. +@sstring: schar+. +dchar: ~['"']; + '""'. {all characters, dquotes must be doubled} +schar: ~["'"]; + "''". {all characters, squotes must be doubled} +hex: "#", ["0"-"9"; "a"-"f"; "A"-"F"]+, S. + +-S: (" "; #9; comment)*. +comment: "{", cchar*, "}". +-cchar: ~["}"]. + +{the end} diff --git a/tests/ixml/ixml2.ixml b/tests/ixml/ixml2.ixml new file mode 100644 index 00000000..547bbb17 --- /dev/null +++ b/tests/ixml/ixml2.ixml @@ -0,0 +1,60 @@ +ixml: S, rule+. +rule: mark?, name, ["=:"], S, -alts, ".", S. +alts: alt+([";|"], S). +alt: term*(",", S). +-term: factor; + repeat0; + repeat1; + option. +repeat0: factor, "*", S, sep?. +repeat1: factor, "+", S, sep?. +option: factor, "?", S. +sep: factor. + +-factor: nonterminal; + terminal; + "(", S, alts, ")", S. +nonterminal: mark?, name. +-terminal: literal; charset. +-literal: quoted; encoded. +-charset: inclusion; exclusion. + +quoted: tmark?, -string. +encoded: tmark?, @hex. +inclusion: tmark?, "[", S, element+([";|"], S), "]", S. 
+exclusion: tmark?, "~", S, "[", S, element+([";|"], S), "]", S. + +@tmark: ["^-"], S. +-element: range; + string; + hex; + class. +range: from, "-", S, to. +@from: character. +@to: character. +-character: -'"', dchar, -'"', S; + -"'", schar, -"'", S; + hex. +class: letter, letter, S. {One of the Unicode character classes} +-letter: ["a"-"z"; "A"-"Z"]. +@name: letgit, xletter*, S. +-letgit: ["a"{first}-"z"; "A"-"Z"; "0"-"9"]. +-xletter: letgit; "-". + +@mark: ["@^-"], S. + +string: -'"', dstring, -'"', S; + -"'", sstring, -"'", S. +@dstring: dchar+. +@sstring: schar+. +dchar: ~['"']; + '""'. {all characters, dquotes must be doubled} +schar: ~["'"]; + "''". {all characters, squotes must be doubled} +hex: "#", ["0"-"9"; "a"-"f"; "A"-"F"]+, S. + +-S: (" "; -#a; #9; comment)*. +comment: "{", cchar*, "}". +-cchar: ~["}"]. + +{the end} diff --git a/tests/ixml/ixml2.output.xml b/tests/ixml/ixml2.output.xml new file mode 100644 index 00000000..d4e6b0c5 --- /dev/null +++ b/tests/ixml/ixml2.output.xml @@ -0,0 +1,354 @@ +: , +.: , , [], , , , .: +([], ).: *(, ).: ; ; ; .: , , , .: , , , .: , , .: .: ; ; , , , , .: , .: ; .: ; .: ; .: , .: , .: , , , +([], ), , .: , , , , , +([], ), , .: [], .: ; ; ; .: , , , .: .: .: , , , ; , , , ; .: , , . {One of the Unicode character classes}: [-; -].: , *, .: [-; -; -].: ; .: [], .: , , , ; , , , .: +.: +.: ~[]; . {all characters, dquotes must be doubled}: ~[]; . {all characters, squotes must be doubled}: , [-; -; -]+, .: (; ; )*.: , *, .: ~[].{the end} diff --git a/tests/ixml/ixml3.inp b/tests/ixml/ixml3.inp new file mode 100644 index 00000000..98f81dbf --- /dev/null +++ b/tests/ixml/ixml3.inp @@ -0,0 +1,61 @@ +{A test for what happens with comments in nonterminal attributes} +ixml: S, rule+. +rule: mark?, name, ["=:"], S, -alts, ".", S. +alts: alt+([";|"], S). +alt: term*(",", S). +-term: factor; + repeat0; + repeat1; + option. +repeat0: factor, "*", S, sep?. +repeat1: factor, "+", S, sep?. +option: factor, "?", S. +sep: factor. 
+ +-factor: nonterminal; + terminal; + "(", S, alts, ")", S. +nonterminal: mark?, name. +-terminal: literal; charset. +-literal: quoted; encoded. +-charset: inclusion; exclusion. + +quoted: tmark?, -string. +encoded: tmark?, @hex. +inclusion: tmark?, "[", S, element+([";|"], S), "]", S. +exclusion: tmark?, "~", S, "[", S, element+([";|"], S), "]", S. + +@tmark: ["^-"], S. +-element: range; + string; + hex; + class. +range: from, S, "-", S, to, S. +@from: character. +@to: character. +-character: -'"', dchar, -'"'; + -"'", schar, -"'"; + hex. +class: letter, letter, S. {One of the Unicode character classes} +-letter: ["a"-"z"; "A"-"Z"]. +@name: letgit, xletter*, S. +-letgit: ["a" - "z" {AAA}; "A"-"Z"; "0"-"9"]. +-xletter: letgit; "-". + +@mark: ["@^-"], S. + +string: -'"', dstring, -'"', S; + -"'", sstring, -"'", S. +@dstring: dchar+. +@sstring: schar+. +dchar: ~['"']; + '""'. {all characters, dquotes must be doubled} +schar: ~["'"]; + "''". {all characters, squotes must be doubled} +hex: "#", ["0"-"9"; "a"-"f"; "A"-"F"]+. + +-S: (" "; #9 {comment}; comment)*. +^comment: "{", cchar*, "}". +-cchar: ~["}"]. + +{the end} diff --git a/tests/ixml/ixml3.ixml b/tests/ixml/ixml3.ixml new file mode 100644 index 00000000..46db8852 --- /dev/null +++ b/tests/ixml/ixml3.ixml @@ -0,0 +1,61 @@ +{A test for what happens with comments in nonterminal attributes} +ixml: S, rule+. +rule: mark?, name, ["=:"], S, -alts, ".", S. +alts: alt+([";|"], S). +alt: term*(",", S). +-term: factor; + repeat0; + repeat1; + option. +repeat0: factor, "*", S, sep?. +repeat1: factor, "+", S, sep?. +option: factor, "?", S. +sep: factor. + +-factor: nonterminal; + terminal; + "(", S, alts, ")", S. +nonterminal: mark?, name. +-terminal: literal; charset. +-literal: quoted; encoded. +-charset: inclusion; exclusion. + +quoted: tmark?, -string. +encoded: tmark?, @hex, S. +inclusion: tmark?, "[", S, element+([";|"], S), "]", S. +exclusion: tmark?, "~", S, "[", S, element+([";|"], S), "]", S. 
+ +@tmark: ["^-"], S. +-element: range; + string; + hex; + class. +range: from, S, "-", S, to, S. +@from: character. +@to: character. +-character: -'"', dchar, -'"'; + -"'", schar, -"'"; + hex. +class: letter, letter, S. {One of the Unicode character classes} +-letter: ["a"-"z"; "A"-"Z"]. +@name: letgit, xletter*, S. +-letgit: ["a" - "z" {AAA}; "A"-"Z"; "0"-"9"]. +-xletter: letgit; "-". + +@mark: ["@^-"], S. + +string: -'"', dstring, -'"', S; + -"'", sstring, -"'", S. +@dstring: dchar+. +@sstring: schar+. +dchar: ~['"']; + '""'. {all characters, dquotes must be doubled} +schar: ~["'"]; + "''". {all characters, squotes must be doubled} +hex: "#", ["0"-"9"; "a"-"f"; "A"-"F"]+. + +-S: (" "; #9 {comment}; -#a; comment)*. +^comment: "{", cchar*, "}". +-cchar: ~["}"]. + +{the end} diff --git a/tests/ixml/ixml3.output.xml b/tests/ixml/ixml3.output.xml new file mode 100644 index 00000000..4cf19830 --- /dev/null +++ b/tests/ixml/ixml3.output.xml @@ -0,0 +1,358 @@ +{A test for what happens with comments in nonterminal attributes}: , +.: , , [], , , , .: +([], ).: *(, ).: ; ; ; .: , , , .: , , , .: , , .: .: ; ; , , , , .: , .: ; .: ; .: ; .: , .: , .: , , , +([], ), , .: , , , , , +([], ), , .: [], .: ; ; ; .: , , , , , .: .: .: , , ; , , ; .: , , . {One of the Unicode character classes}: [-; -].: , *, .: [ - {AAA}; -; -].: ; .: [], .: , , , ; , , , .: +.: +.: ~[]; . {all characters, dquotes must be doubled}: ~[]; . {all characters, squotes must be doubled}: , [-; -; -]+.: (; {comment}; )*.: , *, .: ~[].{the end} diff --git a/tests/ixml/test-catalog.xml b/tests/ixml/test-catalog.xml new file mode 100644 index 00000000..efbb31c2 --- /dev/null +++ b/tests/ixml/test-catalog.xml @@ -0,0 +1,182 @@ + + + +

Tests provided by Steven Pemberton in December 2021, + with corrections of 21 December. Reorganized by Norm Tovey-Walsh, February 2022.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Note that the input and the grammar are the same.

+

The expected result supplied in tests.zip cannot be + right: it excludes the delimiter characters like ":" and + ",", although they are not marked hidden in the grammar, and + it marks whitespace as "s", not as "S". It appears to be + parsing the input with a different grammar more like the + current ixml grammar. It has been replaced by output + thought correct. However, the old file has been left in + place, in case of disputes over correctness.

+

The comment text "allcharacters,quotesmustbedoubled" + looks like a possible error, but the comments in the input + are also missing inter-word spaces.

+
+ + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

The exclude element in the rule for dchar included 18 + extraneous blanks; after the exclude, the terminal element + concluded with an extraneous blank line containing 12 + blanks. These have now been removed.

+
+ + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/tests/parse/parse-error.inp b/tests/parse/parse-error.inp new file mode 100644 index 00000000..b47f17e1 --- /dev/null +++ b/tests/parse/parse-error.inp @@ -0,0 +1 @@ +a b<&>. diff --git a/tests/parse/parse-error.ixml b/tests/parse/parse-error.ixml new file mode 100644 index 00000000..cbc2ebf5 --- /dev/null +++ b/tests/parse/parse-error.ixml @@ -0,0 +1,2 @@ +a: "a", punctuation, "b". +punctuation: [",.;:'?!"]. diff --git a/tests/parse/test-catalog.xml b/tests/parse/test-catalog.xml new file mode 100644 index 00000000..274e5ed4 --- /dev/null +++ b/tests/parse/test-catalog.xml @@ -0,0 +1,60 @@ + + + +

Tests provided by Steven Pemberton in December 2021, + with corrections of 21 December. Reorganized by Norm Tovey-Walsh, February 2022.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

The grammar as written allows no hash mark and fragment + identifier. The intent may have been to formulate a grammar + that allowed them, but the simplest change to bring it into + a consistent state is to change the result.

+

For the record, the originally asserted result is now in + file 'url1.disputed-output.xml'.

+
+
+
+ +
diff --git a/tests/parse/url.inp b/tests/parse/url.inp new file mode 100644 index 00000000..827f31e6 --- /dev/null +++ b/tests/parse/url.inp @@ -0,0 +1 @@ +http://http://@http://http:// diff --git a/tests/parse/url.ixml b/tests/parse/url.ixml new file mode 100644 index 00000000..caab158c --- /dev/null +++ b/tests/parse/url.ixml @@ -0,0 +1,12 @@ +url: scheme, ":", authority, path. +scheme: name. +@name: letter+. +authority: "//", host. +host: sub+".". +sub: name. +path: ("/", seg)+. +seg: sname. +@sname: fletter*. +-letter: ["a"-"z"]; ["A"-"Z"]; ["0"-"9"]. +-fletter: letter; ".". + diff --git a/tests/parse/url1.inp b/tests/parse/url1.inp new file mode 100644 index 00000000..3fe4ab92 --- /dev/null +++ b/tests/parse/url1.inp @@ -0,0 +1,2 @@ +http://www.w3.org/TR/1999/xhtml.html#date?name=fred + diff --git a/tests/parse/url1.ixml b/tests/parse/url1.ixml new file mode 100644 index 00000000..5b7336ef --- /dev/null +++ b/tests/parse/url1.ixml @@ -0,0 +1,12 @@ +url: scheme, ":", authority, path. +@scheme: name. +-name: letter+. +-authority: "//", host. +host: sub+".". +-sub: name. +path: ("/", seg)+. +-seg: sname. +-sname: fletter*. +-letter: ["a"-"z"]; ["A"-"Z"]; ["0"-"9"]. +-fletter: letter; ".". + diff --git a/tests/readme.md b/tests/readme.md new file mode 100644 index 00000000..f8d3a115 --- /dev/null +++ b/tests/readme.md @@ -0,0 +1,51 @@ +This directory contains a version of Steven Pemberton's tests.zip +file, modified by Michael Sperberg-McQueen, further modified by Norm +Tovey-Walsh, and packaged with a test catalog using the test-catalog +vocabulary defined in MSM's ixml-tests repository. + +Note that a large number of tests have been changed vis-a-vis +tests.zip. In many cases this only involved removing non-grammatical +whitespace from the end of the input, but in other cases larger +interventions were made. The details are in the catalog. 
+ +In each directory, there are three files per test: +* file.ixml: the ixml grammar for the test +* file.inp: sample input for that grammar +* file.req: the expected output for that input + +MSM has made test catalogs for these tests using the test-catalog vocabulary +defined in his ixml-tests repository. + +* SP-syntaxtests-package.zip +* tests-SP-MSM + +For the syntaxtests directory, the result is packaged in a zip file +containing the tests and catalogs. (No input files or result files +are included, because none are needed: none of the tests consume input +other than the grammar files, and none produces output.) + +The SP-syntaxtests-package.zip file contains three catalogs, which +specify the tests in different ways: + +* In `catalog-as-instance-tests-ixml.xml`, the catalog specifies the +tests as instance tests, using `../../../ixml.ixml` as the grammar +against which they are to be parsed. + +* In `catalog-as-instance-tests-xml.xml`, the catalog specifies the +tests as instance tests, using `../../../ixml.xml` as the grammar +against which they are to be parsed. Processors which don't support +the XML form of ixml grammars won't want to bother running these. + +* In `catalog-as-grammar-tests.xml`, the processor is to use its +inbuilt ixml grammar. Assuming the processor is using the current +ixml grammar, the results should be the same. + +In the directory tests-SP-MSM, a test catalog is packaged with +corrected input and result files. As noted case by case in the +catalog, a number of changes were made, often stripping ungrammatical +whitespace from the input files and in some cases stripping +nonsignificant whitespace from the expected result, to avoid causing +problems for XML comparators using deep-equal(), or for other +comparators looking at output with different pretty-printing +practices. 
+ diff --git a/tests/reference/ixml.ixml b/tests/reference/ixml.ixml new file mode 100644 index 00000000..6bff1cfd --- /dev/null +++ b/tests/reference/ixml.ixml @@ -0,0 +1,67 @@ +{ From the 2022-01-25 draft specification } + + ixml: s, rule+s, s. + + -s: (whitespace; comment)*. + -whitespace: -[Zs]; tab; lf; cr. + -tab: -#9. + -lf: -#a. + -cr: -#d. + comment: -"{", (cchar; comment)*, -"}". + -cchar: ~["{}"]. + + rule: (mark, s)?, name, s, -["=:"], s, -alts, -".". + @mark: ["@^-"]. + alts: alt+(-[";|"], s). + alt: term*(-",", s). + -term: factor; + option; + repeat0; + repeat1. + -factor: terminal; + nonterminal; + -"(", s, alts, -")", s. + repeat0: factor, -"*", s, sep?. + repeat1: factor, -"+", s, sep?. + option: factor, -"?", s. + sep: factor. + nonterminal: (mark, s)?, name, s. + + -terminal: literal; + charset. + literal: quoted; + encoded. + -quoted: (tmark, s)?, string. + + @name: namestart, namefollower*. + -namestart: ["_"; L]. +-namefollower: namestart; ["-.·‿⁀"; Nd; Mn]. + + @tmark: ["^-"]. + @string: -'"', dchar+, -'"', s; + -"'", schar+, -"'", s. + dchar: ~['"'; #a; #d]; + '"', -'"'. {all characters except line breaks; quotes must be doubled} + schar: ~["'"; #a; #d]; + "'", -"'". {all characters except line breaks; quotes must be doubled} + -encoded: (tmark, s)?, -"#", @hex, s. + hex: ["0"-"9"; "a"-"f"; "A"-"F"]+. + + -charset: inclusion; + exclusion. + inclusion: (tmark, s)?, set. + exclusion: (tmark, s)?, -"~", s, set. + -set: -"[", s, member*(-[";|"], s), -"]", s. + -member: literal; + range; + class. + range: from, s, -"-", s, to, s. + @from: character. + @to: character. + -character: -'"', dchar, -'"'; + -"'", schar, -"'"; + "#", hex. + class: code, s. + @code: capital, letter?. + -capital: ["A"-"Z"]. + -letter: ["a"-"z"]. 
diff --git a/tests/reference/ixml.xml b/tests/reference/ixml.xml new file mode 100644 index 00000000..933100d6 --- /dev/null +++ b/tests/reference/ixml.xml @@ -0,0 +1,485 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all characters except line breaks; quotes must be doubled + + + + + + + + + + + + + + all characters except line breaks; quotes must be doubled + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/syntax/catalog-as-grammar-tests.xml b/tests/syntax/catalog-as-grammar-tests.xml new file mode 100644 index 00000000..21274885 --- /dev/null +++ b/tests/syntax/catalog-as-grammar-tests.xml @@ -0,0 +1,339 @@ + + + +

Syntax tests provided by Steven Pemberton in December 2021.

+

All tests are negative (the file.ixml files do not conform + to the grammar for ixml).

+

Since these are all parsed against the grammar for ixml + grammars, the expected result could be given as + 'assert-not-a-grammar' as well as 'assert-not-a-sentence'. The + inputs are not sentences in the language defined by the grammar + for grammars, which means they are not ixml grammars.

+

Three catalogs are provided, using different formulations. This + one describes all the tests as grammar tests and relies on + the ixml grammar built into the processor.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/tests/syntax/catalog-as-instance-tests-ixml.xml b/tests/syntax/catalog-as-instance-tests-ixml.xml new file mode 100644 index 00000000..e72a3a45 --- /dev/null +++ b/tests/syntax/catalog-as-instance-tests-ixml.xml @@ -0,0 +1,168 @@ + + + +

Syntax tests provided by Steven Pemberton in December 2021.

+

All tests are negative (the file.ixml files do not conform + to the grammar for ixml).

+

Since these are all parsed against the grammar for ixml + grammars, the expected result could be given as + 'assert-not-a-grammar' as well as 'assert-not-a-sentence'. The + inputs are not sentences in the language defined by the grammar + for grammars, which means they are not ixml grammars.

+

Three catalogs are provided, using different formulations. This + one uses the ixml form of the ixml grammar and describes all the + tests as instance tests.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/tests/syntax/catalog-as-instance-tests-xml.xml b/tests/syntax/catalog-as-instance-tests-xml.xml new file mode 100644 index 00000000..43e94b03 --- /dev/null +++ b/tests/syntax/catalog-as-instance-tests-xml.xml @@ -0,0 +1,170 @@ + + + +

Syntax tests provided by Steven Pemberton in December 2021.

+

All tests are negative (the file.ixml files do not conform + to the grammar for ixml).

+

Since these are all parsed against the grammar for ixml + grammars, the expected result could be given as + 'assert-not-a-grammar' as well as 'assert-not-a-sentence'. The + inputs are not sentences in the language defined by the grammar + for grammars, which means they are not ixml grammars.

+

Three catalogs are provided, using different formulations. This + one uses the XML form of the ixml grammar and describes all the + tests as instance tests.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/tests/syntax/class-range.ixml b/tests/syntax/class-range.ixml new file mode 100644 index 00000000..56135313 --- /dev/null +++ b/tests/syntax/class-range.ixml @@ -0,0 +1,2 @@ +a: class-range. +class-range: [L-N]. diff --git a/tests/syntax/defn1.ixml b/tests/syntax/defn1.ixml new file mode 100644 index 00000000..da9e19cf --- /dev/null +++ b/tests/syntax/defn1.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: @".". {Not allowed for terminals} +d: . diff --git a/tests/syntax/defn10.ixml b/tests/syntax/defn10.ixml new file mode 100644 index 00000000..6d359f27 --- /dev/null +++ b/tests/syntax/defn10.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: -(a, b, c). {No marker allowed} +d: . diff --git a/tests/syntax/defn11.ixml b/tests/syntax/defn11.ixml new file mode 100644 index 00000000..07976d46 --- /dev/null +++ b/tests/syntax/defn11.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: ^(a, b, c). {No marker allowed} +d: . diff --git a/tests/syntax/defn2.ixml b/tests/syntax/defn2.ixml new file mode 100644 index 00000000..c5eae6b5 --- /dev/null +++ b/tests/syntax/defn2.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: @'.'. {Not allowed for terminals} +d: . diff --git a/tests/syntax/defn3.ixml b/tests/syntax/defn3.ixml new file mode 100644 index 00000000..224dc618 --- /dev/null +++ b/tests/syntax/defn3.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: 'unterminated. {unterminated string} +d: . diff --git a/tests/syntax/defn4.ixml b/tests/syntax/defn4.ixml new file mode 100644 index 00000000..18b4a351 --- /dev/null +++ b/tests/syntax/defn4.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: "unterminated. {unterminated string} +d: . diff --git a/tests/syntax/defn5.ixml b/tests/syntax/defn5.ixml new file mode 100644 index 00000000..dda32739 --- /dev/null +++ b/tests/syntax/defn5.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: @#9. {no @ for terminals} +d: . 
diff --git a/tests/syntax/defn6.ixml b/tests/syntax/defn6.ixml new file mode 100644 index 00000000..b5c04364 --- /dev/null +++ b/tests/syntax/defn6.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: @~["abc"]. {No @ allowed} +d: . diff --git a/tests/syntax/defn8.ixml b/tests/syntax/defn8.ixml new file mode 100644 index 00000000..bb08e921 --- /dev/null +++ b/tests/syntax/defn8.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: @["abc"]. {No @ allowed} +d: . diff --git a/tests/syntax/defn9.ixml b/tests/syntax/defn9.ixml new file mode 100644 index 00000000..2ece1130 --- /dev/null +++ b/tests/syntax/defn9.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: @(a, b, c). {No @ allowed} +d: . diff --git a/tests/syntax/elem1.inp b/tests/syntax/elem1.inp new file mode 100644 index 00000000..acbe86c7 --- /dev/null +++ b/tests/syntax/elem1.inp @@ -0,0 +1 @@ +abcd diff --git a/tests/syntax/elem1.ixml b/tests/syntax/elem1.ixml new file mode 100644 index 00000000..99231113 --- /dev/null +++ b/tests/syntax/elem1.ixml @@ -0,0 +1,4 @@ +a: "a", b, c. +b: "b", c, d. +c: "c", []. {it should block here, since nothing matches} +d: "d". diff --git a/tests/syntax/elem2.ixml b/tests/syntax/elem2.ixml new file mode 100644 index 00000000..b9449a92 --- /dev/null +++ b/tests/syntax/elem2.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: [1]. {illegal} +d: . diff --git a/tests/syntax/elem3.ixml b/tests/syntax/elem3.ixml new file mode 100644 index 00000000..1e2c0fa7 --- /dev/null +++ b/tests/syntax/elem3.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: ["a"-""]. {Must be length 1} +d: . diff --git a/tests/syntax/elem4.ixml b/tests/syntax/elem4.ixml new file mode 100644 index 00000000..c2b2db09 --- /dev/null +++ b/tests/syntax/elem4.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: ["a"-"dd"]. {Must be length 1} +d: . diff --git a/tests/syntax/elem5.ixml b/tests/syntax/elem5.ixml new file mode 100644 index 00000000..2bbcd111 --- /dev/null +++ b/tests/syntax/elem5.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: [""-"d"]. 
{Must be length 1} +d: . diff --git a/tests/syntax/elem6.ixml b/tests/syntax/elem6.ixml new file mode 100644 index 00000000..875fc398 --- /dev/null +++ b/tests/syntax/elem6.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: ["aa"-"d"]. {Must be length 1} +d: . diff --git a/tests/syntax/elem7.ixml b/tests/syntax/elem7.ixml new file mode 100644 index 00000000..8e04f9b4 --- /dev/null +++ b/tests/syntax/elem7.ixml @@ -0,0 +1,4 @@ +a: b, c. +b: c, d. +c: ["a".."d"]. {Not ..} +d: . diff --git a/tests/syntax/empty-string.ixml b/tests/syntax/empty-string.ixml new file mode 100644 index 00000000..0f71b7b4 --- /dev/null +++ b/tests/syntax/empty-string.ixml @@ -0,0 +1,4 @@ +a: b, c, d. +b: "b". +c: "" {not allowed}. +d: "d". diff --git a/tests/syntax/expr10.ixml b/tests/syntax/expr10.ixml new file mode 100644 index 00000000..f8565ced --- /dev/null +++ b/tests/syntax/expr10.ixml @@ -0,0 +1,2 @@ +expr: factor+("*"; "/"). +term: \ No newline at end of file diff --git a/tests/syntax/hex2.ixml b/tests/syntax/hex2.ixml new file mode 100644 index 00000000..659ed19c --- /dev/null +++ b/tests/syntax/hex2.ixml @@ -0,0 +1 @@ +hex: "a", #, "b". diff --git a/tests/syntax/illegal-class.ixml b/tests/syntax/illegal-class.ixml new file mode 100644 index 00000000..531844d8 --- /dev/null +++ b/tests/syntax/illegal-class.ixml @@ -0,0 +1,2 @@ +a: class. +class: [Xx]. {Non-existent class} diff --git a/tests/syntax/name-with-spaces.ixml b/tests/syntax/name-with-spaces.ixml new file mode 100644 index 00000000..1d13f166 --- /dev/null +++ b/tests/syntax/name-with-spaces.ixml @@ -0,0 +1,3 @@ +a b: b c. {No spaces in names} +b: . +c: . diff --git a/tests/syntax/rule.ixml b/tests/syntax/rule.ixml new file mode 100644 index 00000000..07dd6a10 --- /dev/null +++ b/tests/syntax/rule.ixml @@ -0,0 +1,3 @@ +a: b, c. +b . {missing colon} +c: . 
diff --git a/tests/syntax/rule1.ixml b/tests/syntax/rule1.ixml new file mode 100644 index 00000000..167e8a1f --- /dev/null +++ b/tests/syntax/rule1.ixml @@ -0,0 +1,3 @@ +a: b, c. +b: b. c. {extra .} +c: . diff --git a/tests/syntax/rule10.ixml b/tests/syntax/rule10.ixml new file mode 100644 index 00000000..e2f5c847 --- /dev/null +++ b/tests/syntax/rule10.ixml @@ -0,0 +1,2 @@ +a: "(" num ")". {missing commas} +num: "1". diff --git a/tests/syntax/rule2.ixml b/tests/syntax/rule2.ixml new file mode 100644 index 00000000..858a1e64 --- /dev/null +++ b/tests/syntax/rule2.ixml @@ -0,0 +1,3 @@ +a: b, c. +a: b, c. {double defined} + diff --git a/tests/syntax/rule3.ixml b/tests/syntax/rule3.ixml new file mode 100644 index 00000000..a7d62838 --- /dev/null +++ b/tests/syntax/rule3.ixml @@ -0,0 +1,3 @@ +a: b, c. +b: b, c, . {spurious comma} +c: ".". diff --git a/tests/syntax/rule4.ixml b/tests/syntax/rule4.ixml new file mode 100644 index 00000000..6fc7a45b --- /dev/null +++ b/tests/syntax/rule4.ixml @@ -0,0 +1,3 @@ +a: b, c. +b: b, c . +c: ".") {Rule ends wrong} diff --git a/tests/syntax/rule5.ixml b/tests/syntax/rule5.ixml new file mode 100644 index 00000000..f5e0f315 --- /dev/null +++ b/tests/syntax/rule5.ixml @@ -0,0 +1,3 @@ +a: b, c. +$b: b, c. {illegal name} +c: ".". diff --git a/tests/syntax/rule6.ixml b/tests/syntax/rule6.ixml new file mode 100644 index 00000000..2125a456 --- /dev/null +++ b/tests/syntax/rule6.ixml @@ -0,0 +1,3 @@ +a: b, c. +b:: b, c. {double colon} +c: ".". diff --git a/tests/syntax/rule7.ixml b/tests/syntax/rule7.ixml new file mode 100644 index 00000000..818e291f --- /dev/null +++ b/tests/syntax/rule7.ixml @@ -0,0 +1,3 @@ +a: b, c. +^^b: b, c. {double marker} +c: ".". 
diff --git a/tests/syntax/rule8.ixml b/tests/syntax/rule8.ixml new file mode 100644 index 00000000..2a71ac67 --- /dev/null +++ b/tests/syntax/rule8.ixml @@ -0,0 +1 @@ +a: "", "" diff --git a/tests/syntax/rule9.ixml b/tests/syntax/rule9.ixml new file mode 100644 index 00000000..2592128b --- /dev/null +++ b/tests/syntax/rule9.ixml @@ -0,0 +1,4 @@ +a: 1, b, c. {illegal terminal} +b: . +c: . + diff --git a/tests/syntax/unterminated-comment.ixml b/tests/syntax/unterminated-comment.ixml new file mode 100644 index 00000000..1c434245 --- /dev/null +++ b/tests/syntax/unterminated-comment.ixml @@ -0,0 +1,3 @@ +a: b. {This is an unterminated comment +b: c. +c: . diff --git a/tests/syntax/unterminated-comment1.ixml b/tests/syntax/unterminated-comment1.ixml new file mode 100644 index 00000000..e2f881c0 --- /dev/null +++ b/tests/syntax/unterminated-comment1.ixml @@ -0,0 +1,4 @@ +a: b. {This is an unterminated comment +{with a nested comment} +b: c. +c: . diff --git a/tests/syntax/unterminated-comment2.ixml b/tests/syntax/unterminated-comment2.ixml new file mode 100644 index 00000000..9e3db3b1 --- /dev/null +++ b/tests/syntax/unterminated-comment2.ixml @@ -0,0 +1,5 @@ +a: b. {This is an unterminated comment + +b: c. +{with a nested unterminated comment +c: . diff --git a/tests/syntax/unused-rule.ixml b/tests/syntax/unused-rule.ixml new file mode 100644 index 00000000..00a49efb --- /dev/null +++ b/tests/syntax/unused-rule.ixml @@ -0,0 +1,6 @@ +a: b, undefined. +b: c. +c: . +unused: e. +e: f. +f: . diff --git a/tests/syntax/unused-rules.ixml b/tests/syntax/unused-rules.ixml new file mode 100644 index 00000000..c44422ba --- /dev/null +++ b/tests/syntax/unused-rules.ixml @@ -0,0 +1,7 @@ +a: b, undefined1. +b: c. +c: undefined2. +unused1: e. +e: f. +unused2: . +f: . diff --git a/tests/test-catalog.xml b/tests/test-catalog.xml new file mode 100644 index 00000000..f1712c82 --- /dev/null +++ b/tests/test-catalog.xml @@ -0,0 +1,31 @@ + + + +

Top-level catalog for tests provided by + Steven Pemberton in December 2021.

+

There are two sub-groups, one general + and one focused on syntax issues.

+
+ + + + + + + + + + + + +
diff --git a/tests/tools/href-check.xsl b/tests/tools/href-check.xsl new file mode 100644 index 00000000..fe9be99b --- /dev/null +++ b/tests/tools/href-check.xsl @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + {@href}{$NL} + + + + + + + Missing test-set-ref: {resolve-uri(@href, base-uri(.))}{$NL} + + + + + + + {@href}{$NL} + + + Missing {local-name(.)}: {resolve-uri(@href, base-uri(.))}{$NL} + + + + + + {@href}{$NL} + + + Missing {local-name(.)}: {resolve-uri(@href, base-uri(.))}{$NL} + + + + + + + + + +