[PATCH][NIH] nih_parse parsing framework

Casey Dahlin cdahlin at redhat.com
Wed Feb 10 06:18:44 GMT 2010


Well, after an exahusting series of rewrites and redesigns, the nih_parse
framework is finally here!

Basically this consists of a generic PEG
(http://en.wikipedia.org/wiki/Parsing_expression_grammar) parser that accepts a
grammar specification as input. As of right now that specification is given by
defining a few static struct constants (see the testcase for examples). I'll be
writing a utility to generate parsers from a DSL shortly.

The code is rougher than I'd like, mostly because the algorithm is inherently
recursive, and the code isn't recursive at all. This is done via some goto
magic that implements a sort of pseudo-call-and-return so the code can be
written like its recursive but use a simulated nih_alloc'd stack. The weak
point is that it can deal with "A calls A calls A" recursion, but not "A calls
B calls A" recursion. The result is everything ends up in one function. Its not
ideal, but I've written about 4 of these things in the process of trying to get
the design right, and this has actually proven to be the most pleasent
iteration. All the bugs found in testing were minor and very obvious to fix.

One aside: trying to get this on launchpad I get:

bzr: ERROR:
CHKInventoryRepository('lp-64613584:///~scott/libnih/trunk/.bzr/repository')
is not compatible with
KnitPackRepository('lp-64613584:///~cjdahlin/libnih/nih-parse/.bzr/repository')
different serializers

What's the fix for that?

--CJD

=== modified file 'nih/Makefile.am'
--- nih/Makefile.am	2009-11-21 21:44:23 +0000
+++ nih/Makefile.am	2010-02-10 05:41:20 +0000
@@ -25,6 +25,7 @@
 	command.c \
 	config.c \
 	logging.c \
+	parse.c \
 	error.c
 
 libnih_la_LDFLAGS = \
@@ -58,6 +59,7 @@
 	command.h \
 	config.h \
 	logging.h \
+	parse.h \
 	error.h \
 	errors.h \
 	test.h \
@@ -95,10 +97,15 @@
 	test_command \
 	test_config \
 	test_logging \
+	test_parse \
 	test_error
 
 check_PROGRAMS = $(TESTS)
 
+test_parse_SOURCES = tests/test_parse.c
+test_parse_LDFLAGS = -static
+test_parse_LDADD = libnih.la
+
 test_alloc_SOURCES = tests/test_alloc.c
 test_alloc_LDFLAGS = -static
 test_alloc_LDADD = libnih.la

=== modified file 'nih/errors.h'
--- nih/errors.h	2009-06-23 09:29:37 +0000
+++ nih/errors.h	2010-02-05 14:32:30 +0000
@@ -42,6 +42,8 @@
 
 	NIH_DIR_LOOP_DETECTED,
 
+	NIH_PARSE_FAILED,
+
 	/* 0x20000 thru 0x2FFFF reserved for applications */
 	NIH_ERROR_APPLICATION_START = 0x20000L,
 
@@ -62,4 +64,6 @@
 
 #define NIH_DIR_LOOP_DETECTED_STR          N_("Directory loop detected")
 
+#define NIH_PARSE_FAILED_STR		   N_("Expected %s")
+
 #endif /* NIH_ERRORS_H */

=== added file 'nih/parse.c'
--- nih/parse.c	1970-01-01 00:00:00 +0000
+++ nih/parse.c	2010-02-10 05:38:28 +0000
@@ -0,0 +1,437 @@
+/* libnih
+ *
+ * parse.c - Parsing expression grammar parser
+ *
+ * Copyright © 2010 Casey Dahlin <cdahlin at redhat.com>
+ * Copyright © 2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <string.h>
+
+#include <nih/alloc.h>
+#include <nih/logging.h>
+#include <nih/list.h>
+#include <nih/error.h>
+#include <nih/errors.h>
+
+#include <nih/parse.h>
+
+
+/**
+ * NIH_PARSE_CHARSET_MATCH:
+ * @set: Set to match against,
+ * @c: char to match.
+ *
+ * See if @c is in @set.
+ *
+ * Returns: TRUE or FALSE.
+ **/
+#define NIH_PARSE_CHARSET_MATCH(set, c) ({			\
+	!!((set)[(c)/64] & ((uint64_t)1 << ((c) % 64)));	\
+})
+
+
+/**
+ * XOR:
+ * @a: first condition,
+ * @b: second condition.
+ *
+ * True if EXACTLY ONE of @a and @b is true.
+ **/
+#define XOR(a,b) ((!(a) && (b)) || (!(b) && (a)))
+
+
+/* Prototypes for static declarations. */
+static NihParseNode *nih_parse_node_new (NihParseNode *parent,
+					 NihParseSymbol *symbol,
+					 const char *start, size_t len,
+					 size_t nth_match,
+					 NihParseSymbolChildRef *parent_ref)
+	__attribute__ ((warn_unused_result, malloc));
+
+static NihParseNode *nih_parse_terminal_symbol_matches (void *parent,
+							NihParseSymbol *symbol,
+							const char *text_pos,
+							size_t text_len,
+							int *oom_flag,
+							size_t nth_time,
+					NihParseSymbolChildRef *parent_ref)
+	__attribute__ ((warn_unused_result));
+
+
+/**
+ * nih_parse_symbol_matches:
+ * @alloc_parent: Alloc parent,
+ * @symbol: symbol to match,
+ * @text_pos: position in the text stream,
+ * @text_len: characters remaining.
+ *
+ * Determines if @symbol matches at @text_pos and returns a parse tree
+ * describing its correspondance.
+ *
+ * Returns: A new NihParseNode or NULL on no match or OOM.
+ **/
+NihParseNode *
+nih_parse_symbol_matches (void           *alloc_parent,
+			  NihParseSymbol *symbol,
+			  const char     *text_pos,
+			  size_t          text_len)
+{
+#define RECURSIVE_LOCALS			\
+	size_t                  iter;		\
+	NihParseSymbolChildRef *child;		\
+	NihParseSymbolChildRef *parent_ref;	\
+	NihParseNode           *ret;		\
+	const char             *old_pos;	\
+	size_t                  old_len;	\
+	size_t                  times;		\
+	int                     alt_matched;	\
+	size_t                  nth_time;       \
+	int                     success;	\
+	NihParseNode           *tail;		\
+	NihParseNode           *parent
+
+	RECURSIVE_LOCALS;
+	NihParseNode           *got_returned;
+	const char             *text_end = text_pos + text_len;
+
+	parent = NULL;
+	parent_ref = NULL;
+	ret = NULL;
+	nth_time = 1;
+
+	struct frame {
+		struct frame           *prev;
+		void                   *retpoint;
+
+		NihParseSymbol         *symbol;
+
+		RECURSIVE_LOCALS;
+	} *frame = NULL;
+
+#define DO_OOM_ABORT() ({				\
+	while (frame) {					\
+		struct frame *to_free = frame;		\
+		frame = frame->prev;			\
+		if (to_free->ret)			\
+			nih_free (to_free->ret);	\
+		nih_free (to_free);			\
+	}						\
+							\
+	nih_error_raise_no_memory ();			\
+	if (ret)					\
+		nih_free (ret);				\
+							\
+	return NULL;					\
+})
+
+#define DO_CALL(parent_new, symbol_new, ret_new, nth_new, parent_ref_new) ({ \
+	__label__ retpoint;					\
+	struct frame *next = nih_new (NULL, struct frame);	\
+	struct frame *my_prev;					\
+								\
+	if (! next)						\
+		DO_OOM_ABORT ();				\
+								\
+	next->prev = frame;					\
+	frame = next;						\
+								\
+	frame->iter = iter;					\
+	frame->child = child;					\
+	frame->ret = ret;					\
+	frame->old_pos = old_pos;				\
+	frame->old_len = old_len;				\
+	frame->times = times;					\
+	frame->parent = parent;					\
+	frame->parent_ref = parent_ref;				\
+	frame->symbol = symbol;					\
+	frame->success = success;				\
+	frame->tail = tail;					\
+	frame->alt_matched = alt_matched;			\
+	frame->retpoint = &&retpoint;				\
+								\
+	parent = (parent_new);					\
+	parent_ref = (parent_ref_new);				\
+	symbol = (symbol_new);					\
+	ret = (ret_new);					\
+	nth_time = (nth_new);					\
+								\
+	goto begin;						\
+								\
+retpoint:							\
+	my_prev = frame->prev;					\
+	nih_free (frame);					\
+	frame = my_prev;					\
+								\
+	(got_returned);						\
+})
+
+#define DO_RETURN(x) ({							\
+	NihParseNode *my_ret = (x);					\
+									\
+	if (! frame) {							\
+		if (! my_ret)						\
+			nih_error_raise_printf (NIH_PARSE_FAILED,	\
+					 _(NIH_PARSE_FAILED_STR),	\
+					 "symbol");			\
+		else if (alloc_parent)					\
+			nih_ref (my_ret, alloc_parent);			\
+									\
+		return my_ret;						\
+	}								\
+									\
+	got_returned = my_ret;						\
+	iter = frame->iter;						\
+	child = frame->child;						\
+	ret = frame->ret;						\
+	old_pos = frame->old_pos;					\
+	old_len = frame->old_len;					\
+	times = frame->times;						\
+	parent = frame->parent;						\
+	parent_ref = frame->parent_ref;					\
+	symbol = frame->symbol;						\
+	tail = frame->tail;						\
+	success = frame->success;					\
+	alt_matched = frame->alt_matched;				\
+	goto *frame->retpoint;						\
+})
+
+begin:
+	nih_assert (text_pos != NULL);
+	nih_assert (symbol != NULL);
+
+	if (symbol->type != NIH_PARSE_NONTERM) {
+		int oom_flag = 0;
+
+		if (ret) {
+			nih_free (ret);
+			DO_RETURN (NULL);
+		}
+
+		NihParseNode *term =
+			nih_parse_terminal_symbol_matches (parent, symbol,
+							   text_pos,
+							   text_end - text_pos,
+							   &oom_flag,
+							   nth_time,
+							   parent_ref);
+
+		if (term)
+			text_pos += symbol->length;
+		else if (oom_flag)
+			DO_OOM_ABORT ();
+
+		DO_RETURN (term);
+	}
+
+	alt_matched = 0;
+
+	if (! ret) {
+		ret = nih_parse_node_new (parent, symbol, text_pos, 0,
+					  nth_time, parent_ref);
+	} else {
+		iter = symbol->length - 1;
+		child = &symbol->children[iter];
+		goto backtrack;
+	}
+
+	if (! ret)
+		DO_OOM_ABORT ();
+
+	for (iter = 0; iter < symbol->length; iter++) {
+		child = &symbol->children[iter];
+		old_pos = text_pos;
+
+		if (alt_matched) {
+			alt_matched = child->flags & NIH_PARSE_ALT_WITH_NEXT;
+			continue;
+		}
+
+		for (times = 0; times < child->max_times || ! child->max_times;
+		     times++) {
+			if (! DO_CALL (ret, child->child, NULL, times + 1,
+				       child))
+				break;
+continuation:		continue; /* need something to put the label on */
+		}
+
+		alt_matched = child->flags & NIH_PARSE_ALT_WITH_NEXT;
+
+		if (child->flags & NIH_PARSE_NOT ||
+		    times < child->min_times ||
+		    child->flags & NIH_PARSE_LOOKAHEAD)
+			text_pos = old_pos;
+
+		if (! XOR (times < child->min_times,
+			   child->flags & NIH_PARSE_NOT))
+			continue;
+
+backtrack:
+		alt_matched = 0;
+		success = 0;
+
+		if (child->flags & NIH_PARSE_ALT_WITH_NEXT)
+			continue;
+
+		while (! NIH_LIST_EMPTY (&ret->children)) {
+			tail = (NihParseNode *)ret->children.prev;
+
+			while (tail->parent_ref != child) {
+				iter--;
+				child = &symbol->children[iter];
+			}
+
+			if (child->flags &
+			    (NIH_PARSE_NOT | NIH_PARSE_LOOKAHEAD)) {
+				nih_free (tail);
+				continue;
+			}
+
+			if (tail->nth_match > child->min_times
+			    || child->flags & NIH_PARSE_ALT_WITH_NEXT)
+				success = 1;
+
+			if (DO_CALL (ret, tail->symbol, tail,
+				     tail->nth_match, child)) {
+				times = tail->nth_match - 1;
+				goto continuation;
+			}
+
+			if (success)
+				break;
+		}
+
+		if (! success) {
+			nih_free (ret);
+			DO_RETURN (NULL);
+		}
+
+		if (! NIH_LIST_EMPTY (&ret->children)) {
+			tail = (NihParseNode *)ret->children.prev;
+			text_pos = tail->start + tail->len;
+		} else {
+			text_pos = ret->start;
+		}
+	}
+
+	ret->len = text_pos - ret->start;
+	DO_RETURN (ret);
+
+#undef DO_OOM_ABORT
+#undef DO_CALL
+#undef DO_RETURN
+}
+
+/**
+ * nih_parse_terminal_symbol_matches:
+ * @parent: Alloc parent,
+ * @symbol: symbol to check,
+ * @text_pos: position in text to check,
+ * @text_len: length of text after @text_pos,
+ * @oom_flag: int to set to 1 if an OOM error occurs,
+ * @nth_time: this is the nth time this symbol has been matched,
+ * @parent_ref: reference from parent node to the node to be created.
+ *
+ * Check to see if @symbol matches at @text_pos, where @symbol is a terminal
+ * symbol.
+ *
+ * Returns: A new parse node or NULL on no match or OOM.
+ **/
+static NihParseNode *
+nih_parse_terminal_symbol_matches (void                   *parent,
+				   NihParseSymbol         *symbol,
+				   const char             *text_pos,
+				   size_t                  text_len,
+				   int                    *oom_flag,
+				   size_t                  nth_time,
+				   NihParseSymbolChildRef *parent_ref)
+{
+	NihParseNode *ret;
+
+	nih_assert (symbol != NULL);
+	nih_assert (symbol->type != NIH_PARSE_NONTERM);
+	nih_assert (text_pos != NULL);
+	nih_assert (oom_flag != NULL);
+
+	*oom_flag = 0;
+
+	if (text_len < symbol->length)
+		return NULL;
+
+	if (symbol->type == NIH_PARSE_CHARSET &&
+	    ! NIH_PARSE_CHARSET_MATCH (symbol->charset, *text_pos))
+		return NULL;
+
+	if (symbol->type == NIH_PARSE_STRING &&
+	    strncmp (symbol->text, text_pos, symbol->length))
+		return NULL;
+
+	ret = nih_parse_node_new (parent, symbol, text_pos,
+				  symbol->length, nth_time, parent_ref);
+
+	if (! ret)
+		*oom_flag = 1;
+
+	return ret;
+}
+
+/**
+ * nih_parse_node_new:
+ * @parent: Alloc parent,
+ * @symbol: Symbol matched,
+ * @start: First character if this is a terminal match,
+ * @len: Length if this is a terminal match,
+ * @nth_match: This is the nth consectutive match of @symbol,
+ * @parent_ref: Reference from parent symbol to this symbol.
+ *
+ * Create a new NihParseNode.
+ * 
+ * Returns: The new node or NULL on OOM.
+ **/
+static NihParseNode *
+nih_parse_node_new (NihParseNode           *parent,
+		    NihParseSymbol         *symbol,
+		    const char             *start,
+		    size_t                  len,
+		    size_t                  nth_match,
+		    NihParseSymbolChildRef *parent_ref)
+{
+	NihParseNode *ret = nih_new (parent, NihParseNode);
+
+	nih_assert (symbol != NULL);
+
+	if (! ret)
+		return NULL;
+
+	nih_list_init (&ret->node);
+	nih_list_init (&ret->children);
+	nih_alloc_set_destructor (ret, nih_list_destroy);
+
+	if (parent)
+		nih_list_add (&parent->children, &ret->node);
+
+	ret->start = start;
+	ret->symbol = symbol;
+	ret->len = len;
+	ret->parent_ref = parent_ref;
+	ret->nth_match = nth_match;
+
+	return ret;
+}

=== added file 'nih/parse.h'
--- nih/parse.h	1970-01-01 00:00:00 +0000
+++ nih/parse.h	2010-02-09 23:42:39 +0000
@@ -0,0 +1,121 @@
+/* libnih
+ *
+ * Copyright © 2010 Casey Dahlin <cdahlin at redhat.com>
+ * Copyright © 2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef NIH_PARSE_H
+#define NIH_PARSE_H
+
+#include <sys/types.h>
+
+#include <nih/list.h>
+
+/**
+ * NihParseSymbolRefFlags:
+ *
+ * Flags describing properties of a symbol parent-child relationship.
+ *
+ * NIH_PARSE_ALT_WITH_NEXT: Either this child OR the next should match.
+ *
+ * NIH_PARSE_LOOKAHEAD: Start the next child matching at the same position as
+ * this one.
+ *
+ * NIH_PARSE_NOT: This child must NOT match.
+ **/
+typedef unsigned int NihParseSymbolRefFlags;
+#define NIH_PARSE_ALT_WITH_NEXT 0x1
+#define NIH_PARSE_LOOKAHEAD     0x2
+#define NIH_PARSE_NOT           0x4
+
+/**
+ * NihParseSymbolChildRef:
+ * @child: The child symbol,
+ * @min_times: Number of times @child must match,
+ * @max_times: Number of times @child may match (0 for infinite),
+ * @flags: Properties of this relationship.
+ *
+ * A parent-child relationship between symbols.
+ **/
+struct nih_parse_symbol;
+typedef struct nih_parse_symbol_child_ref {
+	struct nih_parse_symbol *child;
+	size_t min_times;
+	size_t max_times;
+	NihParseSymbolRefFlags flags;
+} NihParseSymbolChildRef;
+
+/**
+ * NihParseSymbol:
+ * @type: type of symbol,
+ * 	NIH_PARSE_STRING: A literal string to match,
+ * 	NIH_PARSE_CHARSET: A set of characters from which to match one,
+ * 	NIH_PARSE_NONTERM: A set of other symbols to match,
+ * @text: text to match (NIH_PARSE_STRING only),
+ * @length: length of @text or @children (Always 1 for NIH_PARSE_CHARSET),
+ * @charset: characters to match as a bitfield (NIH_PARSE_CHARSET only),
+ * @children: child symbols (NIH_PARSE_NONTERM only),
+ * 
+ * A syntactical unit of a PEG grammar.
+ **/
+typedef struct nih_parse_symbol {
+	enum {
+		NIH_PARSE_STRING,
+		NIH_PARSE_CHARSET,
+		NIH_PARSE_NONTERM
+	} type;
+
+	const char                    *text;
+	NihParseSymbolChildRef        *children;
+	size_t                         length;
+	u_int64_t                      charset[4];
+} NihParseSymbol;
+
+
+/**
+ * NihParseNode:
+ * @node: list node,
+ * @symbol: symbol matched,
+ * @start: text start position,
+ * @len: text length,
+ * @nth_match: this was the nth time in a row this symbol matched,
+ * @children: child nodes.
+ * @parent_ref: Child ref from the parent symbol that led to parsing this node.
+ *
+ * Node in a parse tree resulting from matching a string to a particular
+ * symbol.
+ **/
+typedef struct nih_parse_node {
+	NihList                 node;
+	NihParseSymbol         *symbol;
+	const char             *start;
+	size_t                  len;
+	size_t                  nth_match;
+	NihList                 children;
+	NihParseSymbolChildRef *parent_ref;
+} NihParseNode;
+
+
+NIH_BEGIN_EXTERN
+
+NihParseNode *nih_parse_symbol_matches (void *alloc_parent,
+					NihParseSymbol *symbol,
+					const char *text_pos, size_t text_len)
+	__attribute__ ((warn_unused_result, malloc));
+
+NIH_END_EXTERN
+
+#endif /* NIH_PARSE_H */

=== added file 'nih/tests/test_parse.c'
--- nih/tests/test_parse.c	1970-01-01 00:00:00 +0000
+++ nih/tests/test_parse.c	2010-02-10 05:35:38 +0000
@@ -0,0 +1,650 @@
+/* libnih
+ *
+ * test_parse.c - test suite for nih/parse.c
+ *
+ * Copyright © 2010 Casey Dahlin <cdahlin at redhat.com>
+ * Copyright © 2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <nih/test.h>
+
+#include <string.h>
+
+#include <nih/macros.h>
+#include <nih/parse.h>
+#include <nih/alloc.h>
+#include <nih/error.h>
+#include <nih/errors.h>
+
+NihParseSymbol otoro = {
+	.type = NIH_PARSE_STRING,
+	.text = "otoro",
+	.length = 5,
+};
+
+NihParseSymbol sake = {
+	.type = NIH_PARSE_STRING,
+	.text = "sake",
+	.length = 4,
+};
+
+NihParseSymbol sawara = {
+	.type = NIH_PARSE_STRING,
+	.text = "sawara",
+	.length = 6,
+};
+
+NihParseSymbol hotate = {
+	.type = NIH_PARSE_STRING,
+	.text = "hotate",
+	.length = 6,
+};
+
+NihParseSymbol maki = {
+	.type = NIH_PARSE_STRING,
+	.text = "-maki",
+	.length = 5,
+};
+
+NihParseSymbol nigri = {
+	.type = NIH_PARSE_STRING,
+	.text = "-nigri",
+	.length = 6,
+};
+
+NihParseSymbol nl = {
+	.type = NIH_PARSE_STRING,
+	.text = "\n",
+	.length = 1,
+};
+
+NihParseSymbolChildRef fish_children[] = {
+	{ &otoro, 1, 1, NIH_PARSE_ALT_WITH_NEXT },
+	{ &sake, 1, 1, NIH_PARSE_ALT_WITH_NEXT },
+	{ &sawara, 1, 1, NIH_PARSE_ALT_WITH_NEXT },
+	{ &hotate, 1, 1, 0 },
+};
+NihParseSymbol fish = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 4,
+	.children = fish_children,
+};
+
+NihParseSymbolChildRef sushi_type_children[] = {
+	{ &maki, 1, 1, NIH_PARSE_ALT_WITH_NEXT },
+	{ &nigri, 1, 1, 0 },
+};
+NihParseSymbol sushi_type = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sushi_type_children,
+};
+
+NihParseSymbolChildRef sushi_children[] = {
+	{ &fish, 1, 1, 0 },
+	{ &sushi_type, 1, 1, 0 },
+};
+NihParseSymbol sushi = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sushi_children,
+};
+
+NihParseSymbolChildRef sushi_menu_tail_children[] = {
+	{ &nl, 1, 1, 0 },
+	{ &sushi, 1, 1, 0 },
+};
+NihParseSymbol sushi_menu_tail = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sushi_menu_tail_children,
+};
+
+NihParseSymbolChildRef sushi_menu_children[] = {
+	{ &sushi, 1, 1, 0 },
+	{ &sushi_menu_tail, 0, 0, 0 },
+};
+NihParseSymbol sushi_menu = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sushi_menu_children,
+};
+
+NihParseSymbol vowel = {
+	.type = NIH_PARSE_CHARSET,
+	.length = 1,
+	.charset = { 0, 0x20822200000000, 0, 0 }, /* [aeiou] */
+};
+
+NihParseSymbol sawar = {
+	.type = NIH_PARSE_STRING,
+	.text = "sawar",
+	.length = 5,
+};
+
+NihParseSymbolChildRef sawar__children[] = {
+	{ &sawar, 1, 1, 0 },
+	{ &vowel, 1, 1, 0 },
+};
+NihParseSymbol sawar_ = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sawar__children,
+};
+
+NihParseSymbolChildRef sawar_and_a_children[] = {
+	{ &sawara, 1, 1, NIH_PARSE_LOOKAHEAD },
+	{ &sawar_, 1, 1, 0 },
+};
+NihParseSymbol sawar_and_a = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sawar_and_a_children,
+};
+
+NihParseSymbolChildRef sawar_not_a_children[] = {
+	{ &sawara, 1, 1, NIH_PARSE_NOT },
+	{ &sawar_, 1, 1, 0 },
+};
+NihParseSymbol sawar_not_a = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = sawar_not_a_children,
+};
+
+NihParseSymbolChildRef vowels_and_a_vowel_children[] = {
+	{ &vowel, 0, 0, 0 },
+	{ &vowel, 1, 1, 0 },
+};
+NihParseSymbol vowels_and_a_vowel = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = vowels_and_a_vowel_children,
+};
+
+NihParseSymbolChildRef vowels_3_to_6_children[] = {
+	{ &vowel, 3, 6, 0 },
+};
+NihParseSymbol vowels_3_to_6 = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 1,
+	.children = vowels_3_to_6_children,
+};
+
+NihParseSymbol abcab = {
+	.type = NIH_PARSE_STRING,
+	.text = "abcab",
+	.length = 5,
+};
+
+NihParseSymbol abc = {
+	.type = NIH_PARSE_STRING,
+	.text = "abc",
+	.length = 3,
+};
+
+NihParseSymbolChildRef abcab_or_abc_children[] = {
+	{ &abcab, 1, 1, NIH_PARSE_ALT_WITH_NEXT },
+	{ &abc, 1, 1, 0 },
+};
+NihParseSymbol abcab_or_abc = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 2,
+	.children = abcab_or_abc_children,
+};
+
+NihParseSymbolChildRef abcab_or_abc_repeat_children[] = {
+	{ &abcab_or_abc, 2, 0, 0 },
+};
+NihParseSymbol abcab_or_abc_repeat = {
+	.type = NIH_PARSE_NONTERM,
+	.length = 1,
+	.children = abcab_or_abc_repeat_children,
+};
+#define TEST_PARSE_FAIL(...) ({						\
+	NihError *error;						\
+	nih_error_push_context ();					\
+									\
+	TEST_EQ_P (nih_parse_symbol_matches (__VA_ARGS__), NULL);	\
+									\
+	error = nih_error_get ();					\
+									\
+	TEST_EQ (error->number, NIH_PARSE_FAILED);			\
+	TEST_EQ_STR (error->message, "Expected symbol");		\
+	TEST_ALLOC_PARENT (error->message, error);			\
+									\
+	nih_free (error);						\
+	nih_error_pop_context ();					\
+})
+
+#define TEST_NODE(sym, text, times)		\
+	TEST_NE_P (result, NULL);		\
+	TEST_EQ_P (result->symbol, (sym));	\
+	TEST_EQ_STRN (result->start, (text));	\
+	TEST_EQ (result->len, strlen (text));	\
+	TEST_EQ (result->nth_match, (times))
+
+#define TEST_PARSE_SYMBOL_MUST_MATCH(...) ({				\
+	NihParseNode *result = nih_parse_symbol_matches (__VA_ARGS__);	\
+	TEST_NE_P (result, NULL);					\
+	(result);							\
+})
+
+#define TEST_PARSE(...)							\
+	for (NihParseNode *result =					\
+	     TEST_PARSE_SYMBOL_MUST_MATCH (__VA_ARGS__); result;	\
+	     nih_free (result), result = NULL)
+
+#define TEST_CHILD_COUNT(n) ({				\
+	int count = 0;					\
+							\
+	NIH_LIST_FOREACH (&result->children, iter)	\
+		count++;				\
+							\
+	TEST_EQ (count, (n));				\
+})
+
+#define TEST_GET_CHILD(n) ({				\
+	int count = 0;					\
+	NihParseNode *child;				\
+							\
+	NIH_LIST_FOREACH (&result->children, iter) {	\
+		child = (NihParseNode *)iter;		\
+		count++;				\
+		if (count == (n))			\
+			break;				\
+	}						\
+							\
+	TEST_EQ (count, (n));				\
+	(child);					\
+})
+
+/* Some trickiness here to make a new result variable in the new scope with a
+ * value defined in terms of the old result variable from the parent scope.
+ */
+#define TEST_CHILD_NODE(n)				\
+	for (NihParseNode *child = TEST_GET_CHILD(n),	\
+	     *result = child;				\
+	     result;					\
+	     result = NULL)
+
+void
+test_symbol_matches (void)
+{
+	char *string;
+
+	TEST_FUNCTION ("nih_parse_symbol_matches");
+
+
+	TEST_FEATURE ("with trivial string pattern");
+	TEST_PARSE (NULL, &otoro, "otoro", 5) {
+		TEST_NODE (&otoro, "otoro", 1);
+		TEST_CHILD_COUNT (0);
+	}
+
+
+	TEST_FEATURE ("with trivial string pattern and no match");
+	TEST_PARSE_FAIL (NULL, &otoro, "otter", 5);
+
+
+	TEST_FEATURE ("with alternation group");
+	TEST_PARSE (NULL, &fish, "sawara", 6) {
+		TEST_NODE (&fish, "sawara", 1);
+		TEST_CHILD_COUNT (1);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&sawara, "sawara", 1);
+			TEST_CHILD_COUNT (0);
+		}
+	}
+
+
+	TEST_FEATURE ("with alternation group and no match");
+	TEST_PARSE_FAIL (NULL, &fish, "sawhorse", 8);
+
+
+	TEST_FEATURE ("with multiple non-terminal layers");
+	TEST_PARSE (NULL, &sushi, "sake-nigri", 10) {
+		TEST_NODE (&sushi, "sake-nigri", 1);
+		TEST_CHILD_COUNT (2);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&fish, "sake", 1);
+			TEST_CHILD_COUNT (1);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&sake, "sake", 1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+
+		TEST_CHILD_NODE (2) {
+			TEST_NODE (&sushi_type, "-nigri", 1);
+			TEST_CHILD_COUNT (1);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&nigri, "-nigri", 1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with multiple non-terminal layers and no match");
+	TEST_PARSE_FAIL (NULL, &sushi, "sake-negima", 11);
+
+
+	string = "sake-nigri\n"
+		 "sawara-maki\n"
+		 "hotate-nigri";
+
+	TEST_FEATURE ("with repetition");
+	TEST_PARSE (NULL, &sushi_menu, string, 35) {
+		TEST_NODE (&sushi_menu, string, 1);
+		TEST_CHILD_COUNT (3);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&sushi, "sake-nigri", 1);
+			TEST_CHILD_COUNT (2);
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&fish, "sake", 1);
+				TEST_CHILD_COUNT (1);
+
+				TEST_CHILD_NODE (1) {
+					TEST_NODE (&sake, "sake", 1);
+					TEST_CHILD_COUNT (0);
+				}
+			}
+
+			TEST_CHILD_NODE (2) {
+				TEST_NODE (&sushi_type, "-nigri", 1);
+				TEST_CHILD_COUNT (1);
+
+				TEST_CHILD_NODE (1) {
+					TEST_NODE (&nigri, "-nigri", 1);
+					TEST_CHILD_COUNT (0);
+				}
+			}
+		}
+
+		TEST_CHILD_NODE (2) {
+			TEST_NODE (&sushi_menu_tail, "\nsawara-maki", 1);
+			TEST_CHILD_COUNT (2);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&nl, "\n", 1);
+				TEST_CHILD_COUNT (0);
+			}
+
+			TEST_CHILD_NODE (2) {
+				TEST_NODE (&sushi, "sawara-maki", 1);
+				TEST_CHILD_COUNT (2);
+
+				TEST_CHILD_NODE (1) {
+					TEST_NODE (&fish, "sawara", 1);
+					TEST_CHILD_COUNT (1);
+
+					TEST_CHILD_NODE (1) {
+						TEST_NODE (&sawara, "sawara", 1);
+						TEST_CHILD_COUNT (0);
+					}
+				}
+
+				TEST_CHILD_NODE (2) {
+					TEST_NODE (&sushi_type, "-maki", 1);
+					TEST_CHILD_COUNT (1);
+
+					TEST_CHILD_NODE (1) {
+						TEST_NODE (&maki, "-maki", 1);
+						TEST_CHILD_COUNT (0);
+					}
+				}
+			}
+		}
+
+		TEST_CHILD_NODE (3) {
+			TEST_NODE (&sushi_menu_tail, "\nhotate-nigri", 2);
+			TEST_CHILD_COUNT (2);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&nl, "\n", 1);
+				TEST_CHILD_COUNT (0);
+			}
+
+			TEST_CHILD_NODE (2) {
+				TEST_NODE (&sushi, "hotate-nigri", 1);
+				TEST_CHILD_COUNT (2);
+
+				TEST_CHILD_NODE (1) {
+					TEST_NODE (&fish, "hotate", 1);
+					TEST_CHILD_COUNT (1);
+
+					TEST_CHILD_NODE (1) {
+						TEST_NODE (&hotate, "hotate", 1);
+						TEST_CHILD_COUNT (0);
+					}
+				}
+
+				TEST_CHILD_NODE (2) {
+					TEST_NODE (&sushi_type, "-nigri", 1);
+					TEST_CHILD_COUNT (1);
+
+					TEST_CHILD_NODE (1) {
+						TEST_NODE (&nigri, "-nigri", 1);
+						TEST_CHILD_COUNT (0);
+					}
+				}
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with charset (1)");
+	TEST_PARSE (NULL, &vowel, "a", 1) {
+		TEST_NODE (&vowel, "a", 1);
+		TEST_CHILD_COUNT (0);
+	}
+
+
+	TEST_FEATURE ("with charset (2)");
+	TEST_PARSE (NULL, &vowel, "i", 1) {
+		TEST_NODE (&vowel, "i", 1);
+		TEST_CHILD_COUNT (0);
+	}
+
+
+	TEST_FEATURE ("with charset and no match");
+	TEST_PARSE_FAIL (NULL, &vowel, "q", 1);
+
+
+	TEST_FEATURE ("with look-ahead");
+	TEST_PARSE (NULL, &sawar_and_a, "sawara", 6) {
+		TEST_NODE (&sawar_and_a, "sawara", 1);
+		TEST_CHILD_COUNT (2);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&sawara, "sawara", 1);
+			TEST_CHILD_COUNT (0);
+		}
+
+		TEST_CHILD_NODE (2) {
+			TEST_NODE (&sawar_, "sawara", 1);
+			TEST_CHILD_COUNT (2);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&sawar, "sawar", 1);
+				TEST_CHILD_COUNT (0);
+			}
+
+			TEST_CHILD_NODE (2) {
+				TEST_NODE (&vowel, "a", 1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with look-ahead and no match");
+	TEST_PARSE_FAIL (NULL, &sawar_and_a, "saware", 6);
+
+
+	TEST_FEATURE ("with negation");
+	TEST_PARSE (NULL, &sawar_not_a, "saware", 6) {
+		TEST_NODE (&sawar_not_a, "saware", 1);
+		TEST_CHILD_COUNT (1);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&sawar_, "saware", 1);
+			TEST_CHILD_COUNT (2);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&sawar, "sawar", 1);
+				TEST_CHILD_COUNT (0);
+			}
+
+			TEST_CHILD_NODE (2) {
+				TEST_NODE (&vowel, "e", 1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with negation and no match");
+	TEST_PARSE_FAIL (NULL, &sawar_not_a, "sawara", 6);
+
+	
+	TEST_FEATURE ("with bounds");
+	TEST_PARSE (NULL, &vowels_3_to_6, "aeiou", 5) {
+		TEST_NODE (&vowels_3_to_6, "aeiou", 1);
+		TEST_CHILD_COUNT (5);
+
+		for (int i = 0; i < 5; i++) {
+			char match[] = { 'x', '\0' };
+
+			match[0] = "aeiou"[i];
+			TEST_CHILD_NODE (i+1) {
+				TEST_NODE (&vowel, match, i+1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with bounds and too few");
+	TEST_PARSE_FAIL (NULL, &vowels_3_to_6, "ae", 2);
+
+
+	TEST_FEATURE ("with bounds and just enough");
+	TEST_PARSE (NULL, &vowels_3_to_6, "aei", 3) {
+		TEST_NODE (&vowels_3_to_6, "aei", 1);
+		TEST_CHILD_COUNT (3);
+
+		for (int i = 0; i < 3; i++) {
+			char match[] = { 'x', '\0' };
+
+			match[0] = "aei"[i];
+			TEST_CHILD_NODE (i+1) {
+				TEST_NODE (&vowel, match, i+1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with bounds and too many");
+	TEST_PARSE (NULL, &vowels_3_to_6, "aeiouaeiou", 10) {
+		TEST_NODE (&vowels_3_to_6, "aeioua", 1);
+		TEST_CHILD_COUNT (6);
+
+		for (int i = 0; i < 6; i++) {
+			char match[] = { 'x', '\0' };
+
+			match[0] = "aeioua"[i];
+			TEST_CHILD_NODE (i+1) {
+				TEST_NODE (&vowel, match, i+1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+
+
+	TEST_FEATURE ("with backtrack")
+	TEST_PARSE (NULL, &vowels_and_a_vowel, "aeiou", 5) {
+		TEST_NODE (&vowels_and_a_vowel, "aeiou", 1);
+		TEST_CHILD_COUNT (5);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&vowel, "a", 1);
+			TEST_CHILD_COUNT (0);
+		}
+
+		TEST_CHILD_NODE (2) {
+			TEST_NODE (&vowel, "e", 2);
+			TEST_CHILD_COUNT (0);
+		}
+
+		TEST_CHILD_NODE (3) {
+			TEST_NODE (&vowel, "i", 3);
+			TEST_CHILD_COUNT (0);
+		}
+
+		TEST_CHILD_NODE (4) {
+			TEST_NODE (&vowel, "o", 4);
+			TEST_CHILD_COUNT (0);
+		}
+
+		TEST_CHILD_NODE (5) {
+			TEST_NODE (&vowel, "u", 1);
+			TEST_CHILD_COUNT (0);
+		}
+	}
+
+
+	TEST_FEATURE("with sub-backtrack in repetition path");
+	TEST_PARSE (NULL, &abcab_or_abc_repeat, "abcabc", 6) {
+		TEST_NODE (&abcab_or_abc_repeat, "abcabc", 1);
+		TEST_CHILD_COUNT (2);
+
+		TEST_CHILD_NODE (1) {
+			TEST_NODE (&abcab_or_abc, "abc", 1);
+			TEST_CHILD_COUNT (1);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&abc, "abc", 1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+
+		TEST_CHILD_NODE (2) {
+			TEST_NODE (&abcab_or_abc, "abc", 2);
+			TEST_CHILD_COUNT (1);
+
+			TEST_CHILD_NODE (1) {
+				TEST_NODE (&abc, "abc", 1);
+				TEST_CHILD_COUNT (0);
+			}
+		}
+	}
+}
+
+int
+main (int    argc,
+      char **argv)
+{
+	test_symbol_matches ();
+	return 0;
+}




More information about the upstart-devel mailing list