Re: [PATCH 03/12] parse: Start splitting out parsing helpers

1 Jul 2026

On Wed, Jul 01, 2026 at 02:07:12AM +0200, Stefano Brivio wrote:
...
On Fri, 26 Jun 2026 17:09:54 +1000
David Gibson  wrote:
...
As we add more complexity to what forwarding rules are allowed, our
existing ad-hoc C parsing is starting to become quite awkward.  We already
have some parts that resemble a very simple recursive[0] descent parser,
with composable subfunctions that consume as much input as they need,
rather than pre-splitting based on known delimiters.
Start extending this approach, by creating parse.[ch] with parsing helpers
with a uniform interface.  Initially we start with very simple cases: the
parse_keyword() function from fwd_rule.c and another helper to check that
we've reached the end of input.
Rename parse_keyword() to parse_literal(), because it's not just useful
for "keywords" as such.  We can use it in a bunch of additional places for
parsing delimiters and other symbols.  This doesn't gain us a lot now but
will make things clearer as we use more such parser helpers.
[0] Except that the grammars we have aren't actually recursive, so neither
    is the code.
Signed-off-by: David Gibson 
---
 Makefile   | 17 ++++++++------
 conf.c     |  5 +++-
 fwd_rule.c | 33 +++++----------------------
 parse.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 parse.h    | 14 ++++++++++++
 5 files changed, 101 insertions(+), 35 deletions(-)
 create mode 100644 parse.c
 create mode 100644 parse.h

diff --git a/Makefile b/Makefile
index 5ed0f702..bd729c75 100644
--- a/Makefile
+++ b/Makefile
@@ -36,12 +36,13 @@ BASE_CFLAGS += -pedantic -Wall -Wextra -Wno-format-zero-length -Wformat-security
 PASST_SRCS = arch.c arp.c bitmap.c checksum.c conf.c dhcp.c dhcpv6.c \
  epoll_ctl.c flow.c fwd.c fwd_rule.c icmp.c igmp.c inany.c iov.c ip.c \
  isolation.c lineread.c log.c mld.c ndp.c netlink.c migrate.c packet.c \
-	passt.c pasta.c pcap.c pif.c repair.c serialise.c tap.c tcp.c \
+	parse.c passt.c pasta.c pcap.c pif.c repair.c serialise.c tap.c tcp.c \
  tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
  vhost_user.c virtio.c vu_common.c
 QRAP_SRCS = qrap.c
 PASST_REPAIR_SRCS = passt-repair.c
-PESTO_SRCS = pesto.c bitmap.c fwd_rule.c inany.c ip.c lineread.c serialise.c
+PESTO_SRCS = pesto.c bitmap.c fwd_rule.c inany.c ip.c lineread.c parse.c \
+	serialise.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) $(PESTO_SRCS)
MANPAGES = passt.1 pasta.1 pesto.1 qrap.1 passt-repair.1
@@ -49,13 +50,14 @@ MANPAGES = passt.1 pasta.1 pesto.1 qrap.1 passt-repair.1
 PASST_HEADERS = arch.h arp.h bitmap.h checksum.h conf.h dhcp.h dhcpv6.h \
  epoll_ctl.h flow.h fwd.h fwd_rule.h flow_table.h icmp.h icmp_flow.h \
  inany.h iov.h ip.h isolation.h lineread.h log.h migrate.h ndp.h \
-	netlink.h packet.h passt.h pasta.h pcap.h pif.h repair.h serialise.h \
-	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
-	virtio.h vu_common.h
+	netlink.h packet.h parse.h passt.h pasta.h pcap.h pif.h repair.h \
+	serialise.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
+	tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
+	vhost_user.h virtio.h vu_common.h
 QRAP_HEADERS = arp.h ip.h passt.h util.h
 PASST_REPAIR_HEADERS = linux_dep.h
-PESTO_HEADERS = bitmap.h common.h fwd_rule.h inany.h ip.h log.h pesto.h serialise.h
+PESTO_HEADERS = bitmap.h common.h fwd_rule.h inany.h ip.h log.h parse.h \
+	pesto.h serialise.h
C := \#include \nint main(){int a=getrandom(0, 0, 0);}
 ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0)
@@ -223,6 +225,7 @@ pesto.cppcheck: CPPCHECK_FLAGS += --suppress=unusedFunction:bitmap.c
 pesto.cppcheck: CPPCHECK_FLAGS += --suppress=unusedFunction:inany.h
 pesto.cppcheck: CPPCHECK_FLAGS += --suppress=unusedFunction:inany.c
 pesto.cppcheck: CPPCHECK_FLAGS += --suppress=unusedFunction:ip.h
+pesto.cppcheck: CPPCHECK_FLAGS += --suppress=unusedFunction:parse.c
 pesto.cppcheck: CPPCHECK_FLAGS += --suppress=unusedFunction:serialise.c
 pesto.cppcheck: CPPCHECK_FLAGS += --suppress=staticFunction:fwd_rule.c
 pesto.cppcheck: $(PESTO_SRCS) $(PESTO_HEADERS) seccomp_pesto.h
diff --git a/conf.c b/conf.c
index 6ab8efec..ca6c859c 100644
--- a/conf.c
+++ b/conf.c
@@ -52,6 +52,7 @@
 #include "conf.h"
 #include "pesto.h"
 #include "serialise.h"
+#include "parse.h"
#define NETNS_RUN_DIR	"/run/netns"
@@ -1028,7 +1029,9 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid)
 static void conf_nat(const char *arg, struct in_addr *addr4,
           struct in6_addr *addr6, int *no_map_gw)
 {
-	if (strcmp(arg, "none") == 0) {
+	const char *p = arg;
+
+	if (parse_literal(&p, "none") && parse_eoi(p)) {
I think that !*p would be so much clearer here compared to:
- parse_eoi(p)... must be "input"?
- [takes a look at parse_eoi()] yes, it's input, great [goes back to
  here]. Ah, wait, does it return false if *p is NULL? Or true?
Like all the parse_*() functions, it returns true if it *did* see the
thing it was looking for, in this case end of string.
...
- [takes a second look at parse_eoi()] good, it returns true, so this
  is *!p...
I understand it's somewhat consistent with parse_literal() this way, so
I'm not strongly against it, just a slight preference overall.
Right, that's the reason for it.  Because later on composing these
parse helpers has a bit of its own style, it seemed helpful to
continue that style, rather than poking directly into the string.
...
...
*addr4 = in4addr_any;
      *addr6 = in6addr_any;
      if (no_map_gw)
diff --git a/fwd_rule.c b/fwd_rule.c
index 04a0101d..d0e90402 100644
--- a/fwd_rule.c
+++ b/fwd_rule.c
@@ -24,6 +24,7 @@
 #include "fwd_rule.h"
 #include "lineread.h"
 #include "log.h"
+#include "parse.h"
 #include "serialise.h"
/* Ephemeral port range: values from RFC 6335 */
@@ -427,28 +428,6 @@ static int parse_port_range(const char *s, const char **endptr,
  return 0;
 }
-/**
- * parse_keyword() - Parse a literal keyword
- * @s:		String to parse
- * @endptr:	Update to the character after the keyword
- * @kw:		Keyword to accept
- *
- * Return: 0, if @s starts with @kw, -EINVAL if it does not
- */
-static int parse_keyword(const char *s, const char **endptr, const char *kw)
-{
-	size_t len = strlen(kw);
-
-	if (strlen(s) < len)
-		return -EINVAL;
-
-	if (memcmp(s, kw, len))
-		return -EINVAL;
-
-	*endptr = s + len;
-	return 0;
-}
-
 /**
  * fwd_rule_range_except() - Set up forwarding for a range of ports minus a
  *                           bitmap of exclusions
@@ -568,7 +547,7 @@ static void fwd_rule_parse_ports(struct fwd_table *fwd, bool del, uint8_t proto,
      	continue;
      }
-		if (parse_keyword(p, &p, "auto") == 0) {
+		if (parse_literal(&p, "auto")) {
      	if (p != ep) /* Garbage after the keyword */
      		goto bad;
@@ -582,9 +561,8 @@ static void fwd_rule_parse_ports(struct fwd_table *fwd, bool del, uint8_t proto,
      }
/* Should be an exclude range */
-		if (*p != '~')
+		if (!parse_literal(&p, "~"))
      	goto bad;
-		p++;
if (parse_port_range(p, &p, &xrange))
      	goto bad;
@@ -616,8 +594,9 @@ static void fwd_rule_parse_ports(struct fwd_table *fwd, bool del, uint8_t proto,
      if (parse_port_range(p, &p, &orig_range))
      	goto bad;
-		if (*p == ':') { /* There's a range to map to as well */
-			if (parse_port_range(p + 1, &p, &mapped_range))
+		if (parse_literal(&p, ":")) {
+			/* There's a range to map to as well */
+			if (parse_port_range(p, &p, &mapped_range))
      		goto bad;
      	if ((mapped_range.last - mapped_range.first) !=
      	    (orig_range.last - orig_range.first))
diff --git a/parse.c b/parse.c
new file mode 100644
index 00000000..c2a92422
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ *  for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ *  for network namespace/tap device mode
+ *
+ * parse.c - Composable parsing helpers
+ *
+ * Copyright Red Hat
+ * Author: David Gibson 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "common.h"
+
+/**
+ * DOC: Theory of Operation
+ *
+ * These are a number of primitives which can be combined into parsers for
+ * moderately complex input.  For simpler composability, they have common
+ * conventions.
+ *
+ * - Functions return a bool indicating whether they successfully parsed.
As I reached 7/12 (converting parse_port_range()), this became rather
confusing.
It makes sense for parse_eoi() (because in some sense it's a
parse_is_eoi() function), but for the rest I'd expect almost anything
that doesn't have "is" or similar in its name to return 0 on success,
and a negative error code on failure (or -1).
Huh.  So an earlier draft did that, making them more similar to
strtoul() in interface.  I changed it because I was finding it
confusing.  Even knowing the convention, I found it hard to remember to read:
	if (!parse_foo() && !parse_bar())
	
as meaning we *succeeded* in parsing a foo and a bar.  That would not
be true of parse_foo() == 0, but that's bulkier.
...
I'm not sure if I'm ignoring some particular advantage for the
composability you mention above, if it's clearly better for whatever
reason so be it, but otherwise I'd suggest re-considering this.
No technical advantage, I just found it more readable.
...
I mean:
-		if (parse_port_range(p, &p, &xrange))
+		if (!parse_port_range(&p, &xrange))
...is quite telling.
It is?  I'm not sure what it's telling me.
...
...
+ * - Any specific output from the parse is as output parameters
+ * - First argument is always @cursor, a const char **.
+ * - On entry, *@cursor has the point to start parsing.
+ * - On successful exit, *@cursor is updated to the next character after the
+     parsed portion of the input
+ * - On failure, *@cursor and any output arguments are not modified
+ *
+ * For brevity the common parameters and return information are omitted from the
+ * individual function documentation comments.
The reason why we have "@c:	Execution context" all over the
place isn't so much human readability, it's rather automated usage, and
the day we manage to run Sphinx on the whole codebase:
https://bugs.passt.top/show_bug.cgi?id=35
it would be nice to have a clean output instead of having to go through
a pile of warnings.
So, actually, I'd keep @cursor "described" everywhere. We can also skip
it for the moment but I'm afraid we'll start following this example and
it will be more work later.
Yeah, that's reasonable.  Now that I'm not churning so much on the
interfaces it's not too hard to duplicate the descriptions.
...
...
+ */
+
+/**
+ * parse_literal() - Parse a specified literal string
+ * @lit:	Keyword to accept
+ */
+bool parse_literal(const char **cursor, const char *lit)
+{
+	size_t len = strlen(lit);
+
+	if (strlen(*cursor) < len || memcmp(*cursor, lit, len))
+		return false;
+
+	*cursor += len;
+	return true;
+}
+
+/**
+ * parse_eoi() - Parse end of input
+ * @cursor:	Current parse pointer
+ *
+ * Return: true if @p is at End of Input (\0), false otherwise
+ */
+bool parse_eoi(const char *cursor)
+{
+	return !(*cursor);
+}
diff --git a/parse.h b/parse.h
new file mode 100644
index 00000000..eb51a469
--- /dev/null
+++ b/parse.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: David Gibson 
+ */
+
+#ifndef PARSE_H
+#define PARSE_H
+
+#include 
+
+bool parse_literal(const char **cursor, const char *lit);
+bool parse_eoi(const char *cursor);
+
+#endif /* _PARSE_H */
-- 
Stefano
-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson