From 21b2e37847f4cd30c00c3ffa9ec0ff171d3eca35 Mon Sep 17 00:00:00 2001 From: "Fabian N.C. van 't Hooft" Date: Tue, 18 Nov 2025 20:46:08 +0100 Subject: [PATCH 1/4] [feat] csv2 -path and 2xml -format These tools are super and really helpful. The csv2 tool was adapted to add two parameters: -p path = use path instead of /file/record -n namespace = namespace URI for the first item in path The 2xml tool was adapted with one new parameter: -f = format the output file This should facilitate scripting and remove additional tooling such as xmllint --format - etc. All changes are meant to be minimal and backwards compatible. --- 2xml.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ csv2.c | 27 ++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/2xml.c b/2xml.c index fb0836b..07506be 100644 --- a/2xml.c +++ b/2xml.c @@ -28,6 +28,9 @@ int do_html; int in_tag = 0; +int do_format = 0; +int indent_level = 0; +int text_was_printed = 0; /* New: Flag to track if text was just output */ struct node { @@ -37,6 +40,15 @@ struct node struct node *root = NULL; +static void print_indent() +{ + if (!do_format) return; + int i; + for (i = 0; i < indent_level; i++) { + fputs(" ", stdout); + } +} + static void finish_tag() { if (in_tag) { @@ -63,6 +75,12 @@ static void enter(const char *name) finish_tag(); + if (do_format && root != NULL) { + putchar('\n'); + } + print_indent(); + + switch (name[0]) { case '!': if ('\0' != name[1]) { @@ -84,10 +102,19 @@ static void enter(const char *name) in_tag = 1; break; } + + if (do_format) { + switch(name[0]) { + case '@': case '?': case '!': break; + default: indent_level++; + } + } } static void leave(const char *name) { + int was_in_tag = in_tag; + switch (name[0]) { case '@': assert(in_tag); @@ -100,16 +127,24 @@ static void leave(const char *name) fputs("?>",stdout); break; default: - if (!do_html && in_tag) + if (do_format) indent_level--; + + if (!do_html && was_in_tag) fputs("/>",stdout); else { const htmlElemDesc *elem = 
NULL; if (do_html) elem = htmlTagLookup((xmlChar *) name); finish_tag(); if (NULL == elem || (!elem->endTag && !elem->empty)) { + /* Modified: Check text_was_printed flag before adding newline */ + if (do_format && !was_in_tag && !text_was_printed) { + putchar('\n'); + print_indent(); + } fputs("'); + text_was_printed = 0; /* Reset flag after any closing tag */ } } in_tag = 0; @@ -145,6 +180,8 @@ static void chars(const char *stuff,const char *context) #endif } } + /* New: Set flag if we just printed element text (not attribute text) */ + if (context[0] != '@') text_was_printed = 1; } static void release(struct node **ptr) @@ -214,13 +251,19 @@ int main(int argc,char *argv[]) const char *name = strrchr(argv[0],'/'); if (NULL == name) name = argv[0]; else ++name; - if (1 == argc && !strcmp(name,"2html")) do_html = 1; - else if (1 == argc && !strcmp(name,"2xml")) do_html = 0; - else { - fputs("usage: [2xml|2html] < in > [xml|html]\n",stderr); + if (!strcmp(name,"2html")) do_html = 1; + + int arg; + while (-1 != (arg = getopt(argc, argv, "f"))) switch(arg) { + case 'f': + do_format = 1; + break; + case '?': + fputs("usage: [2xml|2html] [-f] < in > [xml|html]\n",stderr); return 2; } + while ((num = read(0,len + buffer,alloc - len)) > 0) { char *end = buffer + len + num,*ptr = buffer,*eol; while ((eol = memchr(ptr,'\n',end - ptr))) { @@ -241,4 +284,4 @@ int main(int argc,char *argv[]) release(&root); putchar('\n'); return 0; -} +} \ No newline at end of file diff --git a/csv2.c b/csv2.c index b045e9f..a1d478b 100644 --- a/csv2.c +++ b/csv2.c @@ -7,6 +7,8 @@ int do_first = 0; const char *quote = ""; const char *delimiter = ","; +const char *path_prefix = "file/record"; +const char *namespace_uri = NULL; // New: To store the namespace URI int num_fields = 0; int recno = 0; @@ -53,7 +55,7 @@ void field(int num,const char *begin,const char *end) { *ptr++ = '\0'; field_names[num] = name; } else { - fputs("/file/record/",stdout); + fprintf(stdout,"/%s/",path_prefix); if (num 
< num_fields && NULL != field_names[num]) fputs(field_names[num],stdout); else @@ -65,8 +67,8 @@ void field(int num,const char *begin,const char *end) { void line(const char *l) { int num = 0; if (!do_first) { - fputs("/file/record\n",stdout); - printf("/file/record/@num=%d\n",recno++); + fprintf(stdout,"/%s\n",path_prefix); + fprintf(stdout,"/%s/@num=%d\n",path_prefix, recno++); } for (;;) { if (NULL != strchr(quote,*l)) { @@ -90,12 +92,16 @@ int main(int argc,char *argv[]) { int arg,num,alloc,len = 0; char *buffer = malloc(alloc = 4096); - while (EOF != (arg = getopt(argc,argv,"fq:d:"))) switch (arg) { + // Modified: Added 'n:' to getopt string + while (EOF != (arg = getopt(argc,argv,"fq:d:p:n:"))) switch (arg) { case 'f': ++do_first; break; case 'q': quote = optarg; break; case 'd': delimiter = optarg; break; + case 'p': path_prefix = optarg; break; + case 'n': namespace_uri = optarg; break; // New: Handle the -n option case '?': - fputs("usage: csv2 [-f] [-q quote] [-d comma] < csv > out\n", + // Modified: Updated usage message + fputs("usage: csv2 [-f] [-q quote] [-d comma] [-p path] [-n namespace] < csv > out\n", stderr); return 2; } @@ -105,6 +111,17 @@ int main(int argc,char *argv[]) { return 2; } + // New: Logic to print the namespace attribute line + if (namespace_uri != NULL) { + // Find the length of the first component of the path_prefix + size_t root_len = strcspn(path_prefix, "/"); + if (root_len > 0) { + // Print the attribute line for the root element + fprintf(stdout, "/%.*s/@xmlns=%s\n", (int)root_len, path_prefix, namespace_uri); + } + } + + while ((num = read(0,len + buffer,alloc - len)) > 0) { char *end = buffer + len + num,*ptr = buffer,*eol; while ((eol = memchr(ptr,'\n',end - ptr))) { From 0c7b48b0f6f43c3bce61577376045d16310d1d8e Mon Sep 17 00:00:00 2001 From: Lorenzo Ancora <34890309+LorenzoAncora@users.noreply.github.com> Date: Sat, 14 Jul 2018 23:12:27 +0200 Subject: [PATCH 2/4] HTML documentation to MarkDown documentation The original 
documentation is disappearing from Internet, along with the author's website. This is the manual conversion of the original documentation from HTML to MarkDown, so it will be available in the future along with the utilities. The text has been manipulated so as to eliminate unnecessary external references and appear correctly on GitHub and similar. --- doc/intro.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 doc/intro.md diff --git a/doc/intro.md b/doc/intro.md new file mode 100644 index 0000000..4c9b314 --- /dev/null +++ b/doc/intro.md @@ -0,0 +1,41 @@ +# XML/Unix Processing Tools + +## Introduction + +These tools are used to convert XML and HTML to and from a line-oriented format more amenable to processing by classic Unix pipeline processing tools, like `grep`, `sed`, `awk`, `cut`, shell scripts, and so forth. + +Documentation (reference.md) is available, and examples (examples.md) are illustrative. + +### Installation + +1. Fetch and install the `gnome-xml` library (`libxml`).

+I'm using version 1.8.6. Other versions might or might not work. +Make sure `xml-config` is on your path. + +2. Fetch and unpack the source tarball for my tools from this repository.

+Look for a file named `xml2-version.tar.gz`. + +3. Run `make`.

You should now have several binaries: `xml2`, `2xml`, `csv2`, `2csv`.
+Symbolic links are used to offer alternative names: `html2` and `2html`. + +4. Copy the binaries and links somewhere. + +### Limitations + +- Namespace support is absent. + +- Whitespace isn't always preserved, and the rules for preserving and generating whitespace are complex. + +- It's possible to preserve all whitespace, but the resulting flat files are big and ugly. In most cases, whitespace is meaningless, used only to make the XML human-readable. Even in HTML, whitespace is sometimes significant and sometimes not, with no easy way to tell which is which. + +- XML is fundamentally hierarchical, not record-oriented. + +- The usefulness of record-oriented Unix tools to this domain will always be limited to simple operations like basic search and replacement, no matter how many syntactic transformations we make. More complex processing requires XML-specific tools like XSLT. + +- The transformation is complex.

+The syntax used by these tools is relatively intuitive, but difficult to describe precisely. (My own documentation relies only on examples.) This makes it difficult to formally reason about data, so subtle errors are easy to make. + +--- + +Author: *Dan Egnor* (ofb.net/~egnor)
+Converted manually by *Lorenzo L. Ancora*, from HTML to MarkDown. All legal rights remain with the original author and this documentation is distributed non-profit. From 4000aa771833fce122546fb58e7305095c88aa85 Mon Sep 17 00:00:00 2001 From: Lorenzo Ancora <34890309+LorenzoAncora@users.noreply.github.com> Date: Sat, 14 Jul 2018 23:15:41 +0200 Subject: [PATCH 3/4] HTML documentation to MarkDown documentation The original documentation is disappearing from Internet, along with the author's website. This is the manual conversion of the original documentation from HTML to MarkDown, so it will be available in the future along with the utilities. The text has been manipulated so as to eliminate unnecessary external references and appear correctly on GitHub and similar. The sarcasm of the author has been preserved, as it should be. :-) --- doc/examples.md | 117 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 doc/examples.md diff --git a/doc/examples.md b/doc/examples.md new file mode 100644 index 0000000..ee2a397 --- /dev/null +++ b/doc/examples.md @@ -0,0 +1,117 @@ +# Examples + +Common GNU tools (`wget`, `grep`, ...) are assumed. + +## XML + +### Use the Slashdot backend. + + % wget -q -O - http://slashdot.org/slashdot.xml | xml2 + + /backslash/@xmlns:backslash=http://slashdot.org/backslash.dtd + /backslash/story/title=More on Athlon Overclocking + /backslash/story/url=http://slashdot.org/articles/00/03/04/1441248.shtml + /backslash/story/time=2000-03-05 03:40:47 + /backslash/story/author=Hemos + /backslash/story/department=better-faster-strong + /backslash/story/topic=amd + /backslash/story/comments=56 + /backslash/story/section=articles + /backslash/story/image=topicamd.gif + /backslash/story + /backslash/story/title=New Atari Jaguar Game Running $1,225 on eBay + /backslash/story/url=http://slashdot.org/articles/00/03/02/1430232.shtml + ... + +### Now, just the headlines. 
+ + % wget -q -O - http://slashdot.org/slashdot.xml | xml2 | grep story/title= | cut -d= -f 2- + + More on Athlon Overclocking + New Atari Jaguar Game Running $1,225 on eBay + AT&T's Korn Shell Source Code Released + TheBench.org: Community Cartooning + OpenGL for Palm OS Environment + Banner Ads on Your Cell Phone + Burning Money on Open Source + Embedded OpenBSD Running the Stallion ePipe + Bezos Responds to Tim O'Reilly's Open Letter + Update on 'Blame Canada' and the Oscars + +### How big is the Red Hat 6.1 libxml RPM? + +For variety, we use awk rather than grep and cut: + + % wget -q -O - http://rpmfind.net/linux/RDF/redhat/6.1/i386/libxml-1.4.0-1.i386.rdf | xml2 | awk -F= '/RPM:Size/ {print $2}' + + 704399 + +### What is the melting point of silicon? + +More awkitude. Don't let your CPU get hotter than this! + + % wget -q -O - http://metalab.unc.edu/xml/examples/periodic_table/allelements.xml | xml2 | awk '/ATOM\/NAME=Silicon/,!/ATOM\//' | awk -F\= '/MELTING_POINT/ {print $2}' + + Kelvin + 1683 + +(1683 K is 2570 °F, by the way.) + + +## HTML + + +### Fetch the Slashdot news page. + +You'll probably see some warnings. (Slashdot has some of the worst HTML I've ever seen...) + + % wget -q -O - http://slashdot.org/ | html2 + + /html/head/title=Slashdot:News for Nerds. Stuff that Matters. + /html/head= + /html= + /html/body/@bgcolor=#000000 + /html/body/@text=#000000 + /html/body/@link=#006666 + /html/body/@vlink=#000000 + /html/body= + /html/body/center/a/@href=http://209.207.224.220/redir.pl?1789 + /html/body/center/a/@target=_top + ... + +### Find all the links. + +If you find the warnings distracting, redirect the standard error of `html2` to `/dev/null`. 
+ + % wget -q -O - http://slashdot.org/ | html2 | grep 'a/@href' | cut -d\= -f 2- | sort | uniq + + /about.shtml + /advertising.shtml + /article.pl?sid=99/03/31/0137221 + /article.pl?sid=99/04/25/1438249 + /article.pl?sid=99/04/27/0310247 + /article.pl?sid=99/04/29/0124247 + /article.pl?sid=99/08/24/1327256&mode=thread + /awards.shtml + /cheesyportal.shtml + /code.shtml + ... + +### Change some colors. + +This pipeline uses both `html2` and `2html` to effect a round-trip. In the middle, `sed` applies a transformation, turning the background of every colored table on the page yellow. Yuck, huh? + + % wget -q -O - http://slashdot.org/ | html2 | sed 's|table/@bgcolor=\(.*\)$|table/@bgcolor=yellow|' | 2html > slashdot.html + % netscape slashdot.html + +### Strip JavaScript from a Geocities home page. + +Geocities uses JavaScript to create an annoying little brand popup in the corner of their members' home pages. Let's delete it. + + % wget -q -O - http://www.geocities.com/SiliconValley/Peaks/5957/xml.html | html2 | grep -vi '^[^=]*/script[/=]' | 2html > xml.html + % netscape xml.html + +--- + +Author: *Dan Egnor* (ofb.net/~egnor)
+Converted manually by *Lorenzo L. Ancora*, from HTML to MarkDown. All legal rights remain with the original author and this documentation is distributed non-profit. From 2b8e76aeb64160683e0bbe064aa9d7b4660ded13 Mon Sep 17 00:00:00 2001 From: Lorenzo Ancora <34890309+LorenzoAncora@users.noreply.github.com> Date: Sat, 14 Jul 2018 23:17:16 +0200 Subject: [PATCH 4/4] HTML documentation to MarkDown documentation The original documentation is disappearing from Internet, along with the author's website. This is the manual conversion of the original documentation from HTML to MarkDown, so it will be available in the future along with the utilities. The text has been manipulated so as to eliminate unnecessary external references and appear correctly on GitHub and similar. --- doc/reference.md | 175 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 doc/reference.md diff --git a/doc/reference.md b/doc/reference.md new file mode 100644 index 0000000..4d8d4b4 --- /dev/null +++ b/doc/reference.md @@ -0,0 +1,175 @@ +# XML/Unix Processing Tools Documentation + +## Usage + +There are six tools. They are all simple filters, reading information from standard input in one format and writing the same information to standard output in a different format. + +| Tool name | Input | Output | +|-----------|--------------|--------| +| xml2 | XML | Flat | +| html2 | HTML | Flat | +| csv2 | CSV | Flat | +| 2xml | Flat | XML | +| 2html | Flat | HTML | +| 2csv | Flat | CSV | + +The "Flat" format is specific to these tools. It is a syntax for representing structured markup in a way that makes it easy to process with line-oriented tools. The same format is used for HTML, XML, and CSV; in fact, you can think of `html2` as converting HTML to XHTML and running `xml2` on the result; likewise `2html` and `2xml`. + +CSV (comma-separated value) files are less expressive than XML or HTML (CSV has no hierarchy), so `xml2` | `2csv` is a lossy conversion. 
+ +## File Format + +To use these tools effectively, it's important to understand the "Flat" format. Unfortunately, I'm lazy and sloppy; rather than provide a precise definition of the relationship between XML and "Flat", I will simply give you a pile of examples and hope you can generalize correctly. + +(Good luck!) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
XMLFlat equivalent
<thing/>/thing

<thing><subthing/></thing>/thing/subthing

<thing>stuff</thing>/thing=stuff

+ <thing>
+ + <subthing>substuff</subthing>
+ stuff
+
+ </thing>
+
+ /thing/subthing=substuff
+ /thing=stuff
+

+ <person>
+ + <name>Juan Doé</name>
+ <occupation>Zillionaire</occupation>
+ <pet>Dogcow</pet>
+ <address>
+ + 123 Camino Real
+ <city>El Dorado</city>
+ <state>AZ</state>
+ <zip>12345</zip>
+
+ </address>
+ <important/>
+
+ </person>
+
+ /person/name=Juan Doé
+ /person/occupation=Zillionaire
+ /person/pet=Dogcow
+ /person/address=123 Camino Real
+ /person/address/city=El Dorado
+ /person/address/state=AZ
+ /person/address/zip=12345
+ /person/important +

+ <collection>
+ + <group>
+ + <thing>stuff</thing>
+ <thing>stuff</thing>
+
+ </group>
+
+ </collection>
+
+ /collection/group/thing=stuff
+ /collection/group/thing
+ /collection/group/thing=stuff
+

+ <collection>
+ + <group>
+ + <thing>stuff</thing>
+
+ </group>
+ <group>
+ + <thing>stuff</thing>
+
+ </group>
+
+ </collection>
+
+ /collection/group/thing=stuff
+ /collection/group
+ /collection/group/thing=stuff
+

+ <thing>
+ + stuff
+
+ more stuff
+ &lt;other stuff&gt; +
+ </thing>
+
+ /thing=stuff
+ /thing=
+ /thing=more stuff
+ /thing=<other stuff>
+

<thing flag="value">stuff</thing> + /thing/@flag=value
+ /thing=stuff
+

+ <?processing instruction?>
+ <thing/> +
+ /?processing=instruction
+ /thing
+
+ +--- + +Author: *Dan Egnor* (ofb.net/~egnor)
+Converted manually by *Lorenzo L. Ancora*, from HTML to MarkDown. All legal rights remain with the original author and this documentation is distributed non-profit.