Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 49 additions & 6 deletions 2xml.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@

int do_html;
int in_tag = 0;
int do_format = 0;
int indent_level = 0;
int text_was_printed = 0; /* New: Flag to track if text was just output */

struct node
{
Expand All @@ -37,6 +40,15 @@ struct node

struct node *root = NULL;

/*
 * Write the indentation for the current nesting depth to stdout:
 * one space per level of indent_level. Does nothing unless output
 * formatting (do_format) is enabled.
 */
static void print_indent()
{
    int remaining = indent_level;

    if (do_format) {
        /* A non-positive depth emits nothing. */
        while (remaining-- > 0)
            putchar(' ');
    }
}

static void finish_tag()
{
if (in_tag) {
Expand All @@ -63,6 +75,12 @@ static void enter(const char *name)

finish_tag();

if (do_format && root != NULL) {
putchar('\n');
}
print_indent();


switch (name[0]) {
case '!':
if ('\0' != name[1]) {
Expand All @@ -84,10 +102,19 @@ static void enter(const char *name)
in_tag = 1;
break;
}

if (do_format) {
switch(name[0]) {
case '@': case '?': case '!': break;
default: indent_level++;
}
}
}

static void leave(const char *name)
{
int was_in_tag = in_tag;

switch (name[0]) {
case '@':
assert(in_tag);
Expand All @@ -100,16 +127,24 @@ static void leave(const char *name)
fputs("?>",stdout);
break;
default:
if (!do_html && in_tag)
if (do_format) indent_level--;

if (!do_html && was_in_tag)
fputs("/>",stdout);
else {
const htmlElemDesc *elem = NULL;
if (do_html) elem = htmlTagLookup((xmlChar *) name);
finish_tag();
if (NULL == elem || (!elem->endTag && !elem->empty)) {
/* Modified: Check text_was_printed flag before adding newline */
if (do_format && !was_in_tag && !text_was_printed) {
putchar('\n');
print_indent();
}
fputs("</",stdout);
fputs(name,stdout);
putchar('>');
text_was_printed = 0; /* Reset flag after any closing tag */
}
}
in_tag = 0;
Expand Down Expand Up @@ -145,6 +180,8 @@ static void chars(const char *stuff,const char *context)
#endif
}
}
/* New: Set flag if we just printed element text (not attribute text) */
if (context[0] != '@') text_was_printed = 1;
}

static void release(struct node **ptr)
Expand Down Expand Up @@ -214,13 +251,19 @@ int main(int argc,char *argv[])
const char *name = strrchr(argv[0],'/');
if (NULL == name) name = argv[0]; else ++name;

if (1 == argc && !strcmp(name,"2html")) do_html = 1;
else if (1 == argc && !strcmp(name,"2xml")) do_html = 0;
else {
fputs("usage: [2xml|2html] < in > [xml|html]\n",stderr);
if (!strcmp(name,"2html")) do_html = 1;

int arg;
while (-1 != (arg = getopt(argc, argv, "f"))) switch(arg) {
case 'f':
do_format = 1;
break;
case '?':
fputs("usage: [2xml|2html] [-f] < in > [xml|html]\n",stderr);
return 2;
}


while ((num = read(0,len + buffer,alloc - len)) > 0) {
char *end = buffer + len + num,*ptr = buffer,*eol;
while ((eol = memchr(ptr,'\n',end - ptr))) {
Expand All @@ -241,4 +284,4 @@ int main(int argc,char *argv[])
release(&root);
putchar('\n');
return 0;
}
}
27 changes: 22 additions & 5 deletions csv2.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
int do_first = 0;
const char *quote = "";
const char *delimiter = ",";
const char *path_prefix = "file/record";
const char *namespace_uri = NULL; // New: To store the namespace URI

int num_fields = 0;
int recno = 0;
Expand Down Expand Up @@ -53,7 +55,7 @@ void field(int num,const char *begin,const char *end) {
*ptr++ = '\0';
field_names[num] = name;
} else {
fputs("/file/record/",stdout);
fprintf(stdout,"/%s/",path_prefix);
if (num < num_fields && NULL != field_names[num])
fputs(field_names[num],stdout);
else
Expand All @@ -65,8 +67,8 @@ void field(int num,const char *begin,const char *end) {
void line(const char *l) {
int num = 0;
if (!do_first) {
fputs("/file/record\n",stdout);
printf("/file/record/@num=%d\n",recno++);
fprintf(stdout,"/%s\n",path_prefix);
fprintf(stdout,"/%s/@num=%d\n",path_prefix, recno++);
}
for (;;) {
if (NULL != strchr(quote,*l)) {
Expand All @@ -90,12 +92,16 @@ int main(int argc,char *argv[]) {
int arg,num,alloc,len = 0;
char *buffer = malloc(alloc = 4096);

while (EOF != (arg = getopt(argc,argv,"fq:d:"))) switch (arg) {
// Modified: Added 'n:' to getopt string
while (EOF != (arg = getopt(argc,argv,"fq:d:p:n:"))) switch (arg) {
case 'f': ++do_first; break;
case 'q': quote = optarg; break;
case 'd': delimiter = optarg; break;
case 'p': path_prefix = optarg; break;
case 'n': namespace_uri = optarg; break; // New: Handle the -n option
case '?':
fputs("usage: csv2 [-f] [-q quote] [-d comma] < csv > out\n",
// Modified: Updated usage message
fputs("usage: csv2 [-f] [-q quote] [-d comma] [-p path] [-n namespace] < csv > out\n",
stderr);
return 2;
}
Expand All @@ -105,6 +111,17 @@ int main(int argc,char *argv[]) {
return 2;
}

// New: Logic to print the namespace attribute line
if (namespace_uri != NULL) {
// Find the length of the first component of the path_prefix
size_t root_len = strcspn(path_prefix, "/");
if (root_len > 0) {
// Print the attribute line for the root element
fprintf(stdout, "/%.*s/@xmlns=%s\n", (int)root_len, path_prefix, namespace_uri);
}
}


while ((num = read(0,len + buffer,alloc - len)) > 0) {
char *end = buffer + len + num,*ptr = buffer,*eol;
while ((eol = memchr(ptr,'\n',end - ptr))) {
Expand Down
117 changes: 117 additions & 0 deletions doc/examples.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Examples

Common GNU tools (`wget`, `grep`, ...) are assumed.

## XML

### Use the Slashdot backend.

% wget -q -O - http://slashdot.org/slashdot.xml | xml2

/backslash/@xmlns:backslash=http://slashdot.org/backslash.dtd
/backslash/story/title=More on Athlon Overclocking
/backslash/story/url=http://slashdot.org/articles/00/03/04/1441248.shtml
/backslash/story/time=2000-03-05 03:40:47
/backslash/story/author=Hemos
/backslash/story/department=better-faster-strong
/backslash/story/topic=amd
/backslash/story/comments=56
/backslash/story/section=articles
/backslash/story/image=topicamd.gif
/backslash/story
/backslash/story/title=New Atari Jaguar Game Running $1,225 on eBay
/backslash/story/url=http://slashdot.org/articles/00/03/02/1430232.shtml
...

### Now, just the headlines.

% wget -q -O - http://slashdot.org/slashdot.xml | xml2 | grep story/title= | cut -d= -f 2-

More on Athlon Overclocking
New Atari Jaguar Game Running $1,225 on eBay
AT&T's Korn Shell Source Code Released
TheBench.org: Community Cartooning
OpenGL for Palm OS Environment
Banner Ads on Your Cell Phone
Burning Money on Open Source
Embedded OpenBSD Running the Stallion ePipe
Bezos Responds to Tim O'Reilly's Open Letter
Update on 'Blame Canada' and the Oscars

### How big is the Red Hat 6.1 libxml RPM?

For variety, we use awk rather than grep and cut:

% wget -q -O - http://rpmfind.net/linux/RDF/redhat/6.1/i386/libxml-1.4.0-1.i386.rdf | xml2 | awk -F= '/RPM:Size/ {print $2}'

704399

### What is the melting point of silicon?

<sup>More awkitude. Don't let your CPU get hotter than this!</sup>

% wget -q -O - http://metalab.unc.edu/xml/examples/periodic_table/allelements.xml | xml2 | awk '/ATOM\/NAME=Silicon/,!/ATOM\//' | awk -F\= '/MELTING_POINT/ {print $2}'

Kelvin
1683

<sup>(1683 K is 2570 °F, by the way.)</sup>


## HTML


### Fetch the Slashdot news page.

You'll probably see some warnings. (Slashdot has some of the worst HTML I've ever seen...)

% wget -q -O - http://slashdot.org/ | html2

/html/head/title=Slashdot:News for Nerds. Stuff that Matters.
/html/head=
/html=
/html/body/@bgcolor=#000000
/html/body/@text=#000000
/html/body/@link=#006666
/html/body/@vlink=#000000
/html/body=
/html/body/center/a/@href=http://209.207.224.220/redir.pl?1789
/html/body/center/a/@target=_top
...

### Find all the links.

If you find the warnings distracting, redirect the standard error of `html2` to `/dev/null`.

% wget -q -O - http://slashdot.org/ | html2 | grep 'a/@href' | cut -d\= -f 2- | sort | uniq

/about.shtml
/advertising.shtml
/article.pl?sid=99/03/31/0137221
/article.pl?sid=99/04/25/1438249
/article.pl?sid=99/04/27/0310247
/article.pl?sid=99/04/29/0124247
/article.pl?sid=99/08/24/1327256&mode=thread
/awards.shtml
/cheesyportal.shtml
/code.shtml
...

### Change some colors.

This pipeline uses both `html2` and `2html` to effect a round-trip. In the middle, `sed` applies a transformation, turning the background of every colored table on the page yellow. Yuck, huh?

% wget -q -O - http://slashdot.org/ | html2 | sed 's|table/@bgcolor=\(.*\)$|table/@bgcolor=yellow|' | 2html > slashdot.html
% netscape slashdot.html

### Strip JavaScript from a Geocities home page.

Geocities uses JavaScript to create an annoying little brand popup in the corner of their members' home pages. Let's delete it.

% wget -q -O - http://www.geocities.com/SiliconValley/Peaks/5957/xml.html | html2 | grep -vi '^[^=]*/script[/=]' | 2html > xml.html
% netscape xml.html

---

Author: *Dan Egnor* (ofb.net/~egnor)<br/>
Converted manually by *Lorenzo L. Ancora*, from HTML to Markdown. All legal rights remain with the original author, and this documentation is distributed on a non-profit basis.
41 changes: 41 additions & 0 deletions doc/intro.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# XML/Unix Processing Tools

## Introduction

These tools are used to convert XML and HTML to and from a line-oriented format more amenable to processing by classic Unix pipeline processing tools, like `grep`, `sed`, `awk`, `cut`, shell scripts, and so forth.

[Documentation](reference.md) is available, and the [examples](examples.md) are illustrative.

### Installation

1. Fetch and install the `gnome-xml` library (`libxml`).<br/><br/>
I'm using version 1.8.6. Other versions might or might not work.
Make sure `xml-config` is on your path.

2. Fetch and unpack the source tarball for my tools from this repository.<br/><br/>
Look for a file named `xml2-version.tar.gz`.

3. Run `make`.<br/><br/>You should now have several binaries: `xml2`, `2xml`, `csv2`, `2csv`.<br/>
Symbolic links are used to offer alternative names: `html2` and `2html`.

4. Copy the binaries and links somewhere.

### Limitations

- Namespace support is absent.

- Whitespace isn't always preserved, and the rules for preserving and generating whitespace are complex.

- It's possible to preserve all whitespace, but the resulting flat files are big and ugly. In most cases, whitespace is meaningless, used only to make the XML human-readable. Even in HTML, whitespace is sometimes significant and sometimes not, with no easy way to tell which is which.

- XML is fundamentally hierarchical, not record-oriented.

- The usefulness of record-oriented Unix tools in this domain will always be limited to simple operations like basic search and replacement, no matter how many syntactic transformations we make. More complex processing requires XML-specific tools like XSLT.

- The transformation is complex.<br/><br/>
The syntax used by these tools is relatively intuitive, but difficult to describe precisely. (My own documentation relies only on examples.) This makes it difficult to formally reason about data, so subtle errors are easy to make.

---

Author: *Dan Egnor* (ofb.net/~egnor)<br/>
Converted manually by *Lorenzo L. Ancora*, from HTML to Markdown. All legal rights remain with the original author, and this documentation is distributed on a non-profit basis.
Loading