UpstandingHackers · pesco · Nov 24, 2019 · Nov 24, 2019 · Nov 24, 2019 · Nov 24, 2019
diff --git a/examples/SConscript b/examples/SConscript
@@ -6,8 +6,9 @@ example = env.Clone()
 example.Append(LIBS="hammer", LIBPATH="../src")
 
 dns = example.Program('dns', ['dns.c', 'rr.c', 'dns_common.c'])
+ttuser = example.Program('ttuser', 'ttuser.c')  # demo of user-defined token types
 base64 = example.Program('base64', 'base64.c')
 base64_sem1 = example.Program('base64_sem1', 'base64_sem1.c')
 base64_sem2 = example.Program('base64_sem2', 'base64_sem2.c')
 ties = example.Program('ties', ['ties.c', 'grammar.c'])
-env.Alias("examples", [dns, base64, base64_sem1, base64_sem2, ties])
+env.Alias("examples", [dns, ttuser, base64, base64_sem1, base64_sem2, ties])
diff --git a/examples/ttuser.c b/examples/ttuser.c
@@ -0,0 +1,140 @@
+/*
+ * Example parser that demonstrates the use of user-defined token types.
+ *
+ * Note the custom printer function that hooks into h_pprint().
+ */
+
+#include "../src/hammer.h"
+#include "../src/glue.h"
+
+
+/*
+ * custom tokens -- the type ids are assigned at run time by init()
+ */
+
+HTokenType TT_SUBJ, TT_PRED, TT_OBJ, TT_ADJ, TT_ADVC;
+
+void
+pprint(FILE *stream, const HParsedToken *tok, int indent, int delta)
+{
+	const HParsedToken *inner = tok->user;	/* payload set by act_*() */
+	/*
+	 * Pretty-printer rules:
+	 *  - Output 'indent' spaces after every newline you produce.
+	 *  - Do not add indent on the first line of output.
+	 *  - Do not add a trailing newline.
+	 *  - Indent sub-objects by adding 'delta' to 'indent'.
+	 * A NULL payload is left to h_pprint(), which prints "(null)".
+	 */
+	if (inner != NULL && inner->token_type == TT_SEQUENCE)
+		fprintf(stream, "\n%*s", indent, "");
+	h_pprint(stream, inner, indent, delta);
+}
+
+/* XXX define unamb_sub as well */
+/* Register the custom token types: shared pprint() hook, no unamb_sub. */
+void
+init(void)
+{
+	TT_SUBJ = h_allocate_token_new("subject", NULL, pprint);
+	TT_PRED = h_allocate_token_new("predicate", NULL, pprint);
+	TT_OBJ  = h_allocate_token_new("object", NULL, pprint);
+	TT_ADJ  = h_allocate_token_new("adjective", NULL, pprint);
+	TT_ADVC = h_allocate_token_new("adverbial clause", NULL, pprint);
+}
+
+
+/*
+ * semantic actions
+ *
+ * Normally these would be more interesting, but for this example, each one
+ * just wraps p->ast in a token of its custom type; 'u' is unused.
+ */
+HParsedToken *act_subj(const HParseResult *p, void *u) {
+	return H_MAKE(SUBJ, (void *)p->ast);
+}
+HParsedToken *act_pred(const HParseResult *p, void *u) {
+	return H_MAKE(PRED, (void *)p->ast);
+}
+HParsedToken *act_obj(const HParseResult *p, void *u) {
+	return H_MAKE(OBJ, (void *)p->ast);
+}
+HParsedToken *act_adj(const HParseResult *p, void *u) {
+	return H_MAKE(ADJ, (void *)p->ast);
+}
+HParsedToken *act_advc(const HParseResult *p, void *u) {
+	return H_MAKE(ADVC, (void *)p->ast);
+}
+
+
+/*
+ * grammar: sentnc = subj pred [obj] [advc], all over a fixed word list
+ */
+
+HParser *
+build_parser(void)
+{
+	/* words: W(X) is the literal X wrapped in h_whitespace() */
+	#define W(X)	h_whitespace(h_literal(#X))
+	H_RULE(art,	h_choice(W(a), W(the), NULL));
+	H_RULE(noun,	h_choice(W(cat), W(dog), W(fox), W(tiger), W(lion),
+			    W(bear), W(fence), W(tree), W(car), W(cow), NULL));
+	H_RULE(verb,	h_choice(W(eats), W(jumps), W(falls), NULL));
+	H_ARULE(adj,	h_choice(W(quick), W(slow), W(happy), W(lazy), W(cyan),
+			    W(magenta), W(yellow), W(black), W(brown), NULL));
+	H_RULE(adverb,	h_choice(W(with), W(over), W(after), NULL));
+	#undef W
+
+	/* phrases: article, any number of adjectives, noun */
+	H_RULE(nphrase,	h_sequence(art, h_many(adj), noun, NULL));
+
+	/* sentence structure; H_ARULE presumably binds act_<name> -- see glue.h */
+	H_ARULE(subj,	nphrase);
+	H_ARULE(pred,	verb);
+	H_ARULE(obj,	nphrase);
+	H_ARULE(advc,	h_sequence(adverb, nphrase, NULL));
+	H_RULE(sentnc,	h_sequence(subj, pred,
+			    h_optional(obj), h_optional(advc), NULL));
+
+	return sentnc;
+}
+
+
+/*
+ * main routine: read, parse, print
+ *
+ * input e.g.:
+ * "the quick brown fox jumps the fence with a cyan lion"
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+int
+main(int argc, char **argv)
+{
+	uint8_t input[1024];
+	size_t sz;
+	const HParser *parser;
+	const HParseResult *result;
+
+	init();
+	parser = build_parser();
+
+	/* A full buffer is fine; reject only a read error or leftover input. */
+	sz = fread(input, 1, sizeof(input), stdin);
+	if (!feof(stdin) && (ferror(stdin) || fgetc(stdin) != EOF)) {
+		fprintf(stderr, "too much input\n");
+		return 1;
+	}
+	result = h_parse(parser, input, sz);
+	if (!result) {
+		fprintf(stderr, "no parse\n");
+		return 1;
+	}
+
+	h_pprintln(stdout, result->ast);
+	fprintf(stderr, "consumed %" PRId64 "/%zu bytes.\n",
+	    result->bit_length / 8, sz);
+	return 0;
+}
diff --git a/src/hammer.h b/src/hammer.h
@@ -728,10 +728,22 @@ HAMMER_FN_DECL(void, h_parse_result_free, HParseResult *result);
  */
 char* h_write_result_unamb(const HParsedToken* tok);
 /**
- * Format token to the given output stream. Indent starting at
- * [indent] spaces, with [delta] spaces between levels.
+ * Format token to the given output stream. Indent starting at [indent] spaces,
+ * with [delta] spaces between levels.
+ *
+ * Note: This function does not print a trailing newline. It also does not
+ * print any spaces to indent the initial line of output. This makes it
+ * suitable for recursive use in the condensed output of larger structures.
  */
 void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta);
+/**
+ * Format token to the given output. Print a trailing newline.
+ *
+ * This function assumes an initial indentation of 0 and uses 2 spaces between
+ * indentation levels. It is equivalent to 'h_pprint(stream, tok, 0, 2)'
+ * followed by 'fputc('\n', stream)' and is provided for convenience.
+ */
+void h_pprintln(FILE* stream, const HParsedToken* tok);
 
 /**
  * Build parse tables for the given parser backend. See the
@@ -795,7 +807,8 @@ HTokenType h_allocate_token_type(const char* name);
 /// Allocate a new token type with an unambiguous print function.
 HTokenType h_allocate_token_new(
     const char* name,
-    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf));
+    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf),
+    void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta));
 
 /// Get the token type associated with name. Returns -1 if name is unkown
 HTokenType h_get_token_type_number(const char* name);

diff --git a/src/internal.h b/src/internal.h
@@ -428,6 +428,7 @@ typedef struct HTTEntry_ {
   const char* name;
   HTokenType value;
   void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf);
+  void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta);
 } HTTEntry;
 
 const HTTEntry* h_get_token_type_entry(HTokenType token_type);

diff --git a/src/pprint.c b/src/pprint.c
@@ -23,6 +23,7 @@
 #include "internal.h"
 #include <stdlib.h>
 #include <inttypes.h>
+#include <ctype.h>
 
 typedef struct pp_state {
   int delta;
@@ -31,54 +32,66 @@ typedef struct pp_state {
 } pp_state_t;
 
 void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta) {
+  if (tok == NULL) {
+    fprintf(stream, "(null)");
+    return;
+  }
   switch (tok->token_type) {
   case TT_NONE:
-    fprintf(stream, "%*snull\n", indent, "");
+    fprintf(stream, "none");
     break;
   case TT_BYTES:
-    if (tok->bytes.len == 0)
-      fprintf(stream, "%*s<>\n", indent, "");
-    else {
-      fprintf(stream, "%*s", indent, "");
-      for (size_t i = 0; i < tok->bytes.len; i++) {
-        fprintf(stream,
-                "%c%02hhx",
-                (i == 0) ? '<' : '.',
-                tok->bytes.token[i]);
-      }
-      fprintf(stream, ">\n");
+    fprintf(stream, "\"");
+    for (size_t i = 0; i < tok->bytes.len; i++) {
+      uint8_t c = tok->bytes.token[i];
+      if (isprint(c))
+        fputc(c, stream);
+      else
+        fprintf(stream, "\\%03hho", c);  /* non-printable: 3-digit octal escape */
     }
+    fprintf(stream, "\"");
     break;
   case TT_SINT:
     if (tok->sint < 0)
-      fprintf(stream, "%*ss -%#" PRIx64 "\n", indent, "", -tok->sint);
+      fprintf(stream, "-%#" PRIx64, -(uint64_t)tok->sint);  /* unsigned negate: safe for INT64_MIN */
     else
-      fprintf(stream, "%*ss %#" PRIx64 "\n", indent, "", tok->sint);
-
+      fprintf(stream, "+%#" PRIx64, (uint64_t)tok->sint);
     break;
   case TT_UINT:
-    fprintf(stream, "%*su %#" PRIx64 "\n", indent, "", tok->uint);
+    fprintf(stream, "%#" PRIx64, tok->uint);
     break;
-  case TT_SEQUENCE: {
-    fprintf(stream, "%*s[\n", indent, "");
-    for (size_t i = 0; i < tok->seq->used; i++) {
-      h_pprint(stream, tok->seq->elements[i], indent + delta, delta);
+  case TT_SEQUENCE:
+    if (tok->seq->used == 0)
+      fprintf(stream, "[ ]");
+    else {
+      fprintf(stream, "[%*s", delta - 1, "");  /* '[' plus padding spans one indent step */
+      for (size_t i = 0; i < tok->seq->used; i++) {
+        if (i > 0) fprintf(stream, "\n%*s,%*s", indent, "", delta - 1, "");
+        h_pprint(stream, tok->seq->elements[i], indent + delta, delta);
+      }
+      if (tok->seq->used > 2)
+        fprintf(stream, "\n%*s]", indent, "");
+      else
+        fprintf(stream, " ]");
     }
-    fprintf(stream, "%*s]\n", indent, "");
-  }
-    break;
-  case TT_USER:
-    fprintf(stream, "%*sUSER:%s\n", indent, "", h_get_token_type_name(tok->token_type));
     break;
   default:
-    if(tok->token_type > TT_USER) {
-      fprintf(stream, "%*sUSER:%s %d\n", indent, "", h_get_token_type_name(tok->token_type), tok->token_type-TT_USER);
+    if(tok->token_type >= TT_USER) {
+      const HTTEntry *e = h_get_token_type_entry(tok->token_type);
+      if (e == NULL) { fprintf(stream, "USER %d (unregistered) ", tok->token_type - TT_USER); break; }
+      fprintf(stream, "USER %d (%s) ", e->value - TT_USER, e->name);
+      if (e->pprint) e->pprint(stream, tok, indent, delta);  /* user hook prints the payload */
     } else {
       assert_message(0, "Should not reach here.");
     }
   }
 }
 
+void h_pprintln(FILE* stream, const HParsedToken* tok) {
+  h_pprint(stream, tok, 0, 2);  /* initial indent 0, 2 spaces per level */
+  fputc('\n', stream);          /* h_pprint() itself emits no trailing newline */
+}
+
 
 struct result_buf {
   char* output;

diff --git a/src/registry.c b/src/registry.c
@@ -54,12 +54,14 @@ static void default_unamb_sub(const HParsedToken* tok,
 
 HTokenType h_allocate_token_new(
     const char* name,
-    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf)) {
+    void (*unamb_sub)(const HParsedToken *tok, struct result_buf *buf),
+    void (*pprint)(FILE* stream, const HParsedToken* tok, int indent, int delta)) {
   HTTEntry* new_entry = h_alloc(&system_allocator, sizeof(*new_entry));
   assert(new_entry != NULL);
   new_entry->name = name;
   new_entry->value = 0;
-  new_entry->unamb_sub = unamb_sub;
+  new_entry->unamb_sub = unamb_sub ? unamb_sub : default_unamb_sub;
+  new_entry->pprint = pprint;
   HTTEntry* probe = *(HTTEntry**)tsearch(new_entry, &tt_registry, compare_entries);
   if (probe->value != 0) {
     // Token type already exists...
@@ -86,7 +88,7 @@ HTokenType h_allocate_token_new(
   }
 }
 HTokenType h_allocate_token_type(const char* name) {
-  return h_allocate_token_new(name, default_unamb_sub);
+  return h_allocate_token_new(name, NULL, NULL);  // NULL unamb_sub selects default_unamb_sub; no pprint hook
 }
 HTokenType h_get_token_type_number(const char* name) {
   HTTEntry e;