On Tue, Jan 13, 2026 at 12:26:36AM +0100, Stefano Brivio wrote:
On Thu, 8 Jan 2026 13:29:41 +1100 David Gibson
wrote: Previously we created inbound listening sockets as we parsed the forwarding options (-t, -u) whereas outbound listening sockets were created during {tcp,udp}_init(). Now that we have a data structure recording the full details of the listening options we can move all listening socket creation to {tcp,udp}_init(). This means that errors for either direction are detected and reported the same way.
Introduce fwd_listen_sync() which synchronizes the state of listening sockets to the forwarding rules table, both for fixed and automatic forwards.
This does cause a change in semantics for "exclude only" port specifications. Previously an option like -t ~6000 wouldn't cause a fatal error, as long as we could bind at least one port. Now, it requires at least one port for each generated rule; that is, for each of the contiguous blocks of ports the specification resolves to. With typical ephemeral port settings, that's one port each in 1..5999, 6001..32767 and 61000..65535.
Preserving the exact behaviour for this case would require a considerably more complex data structure, so I'm hoping this is a sufficiently niche case for the change to be acceptable.
I guess so too, I wouldn't really worry.
Well, I'm not sure if it works, but one relatively simple idea could be to have a "with_prev" bit in the rule struct representing the fact that the current rule was derived from the same port specification as the previous rule, which implies they would need to be deleted all together (but we can happily enforce that).
Then, in the fwd_listen_sync_() loop, before reporting failure, you would check the next entry: if the "with_prev" bit is set, report failure only if we fail (keeping a local boolean flag) for all the entries up to the first one with "with_prev" unset.
I'll keep that approach in mind if it seems like we need it.
I would be inclined to say it's worth it if it's that simple, but I haven't tried, so I might very well be missing something.
I also considered making WEAK mean we'd always continue on listen failures, even if all of them fail. Maybe that's a bit unexpected? But it would allow an option to "forward single port X, if you can" which seems like it might be useful.
Signed-off-by: David Gibson
--- conf.c | 27 ---------- fwd.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- fwd.h | 3 ++ ip.c | 1 - tcp.c | 122 ++--------------------------------------- tcp.h | 1 - udp.c | 99 +++------------------------------- udp.h | 1 - 8 files changed, 177 insertions(+), 244 deletions(-) diff --git a/conf.c b/conf.c index 0bcf80d7..57693b3f 100644 --- a/conf.c +++ b/conf.c @@ -148,9 +148,7 @@ static void conf_ports_range_except(const struct ctx *c, char optname, uint8_t flags) { unsigned delta = to - first; - bool bound_one = false; unsigned base, i; - int fd;
if (first == 0) { die("Can't forward port 0 for option '-%c %s'", @@ -179,28 +177,6 @@ static void conf_ports_range_except(const struct ctx *c, char optname, warn( "Altering mapping of already mapped port number: %s", optarg); } - - if (!(flags & FWD_SCAN) && optname == 't') - fd = tcp_listen(c, PIF_HOST, addr, ifname, i); - else if (!(flags & FWD_SCAN) && optname == 'u') - fd = udp_listen(c, PIF_HOST, addr, ifname, i); - else - /* No way to check in advance for -T and -U */ - fd = 0; - - if (fd == -ENFILE || fd == -EMFILE) { - die( -"Can't open enough sockets for port specifier: %s", - optarg); - } - - if (fd >= 0) { - bound_one = true; - } else if (!(flags & FWD_WEAK)) { - die( -"Failed to bind port %u (%s) for option '-%c %s'", - i, strerror_(-fd), optname, optarg); - } }
if ((optname == 'T' || optname == 'U') && c->no_bindtodevice) { @@ -226,9 +202,6 @@ static void conf_ports_range_except(const struct ctx *c, char optname, } base = i - 1; } - - if (!bound_one) - die("Failed to bind any port for '-%c %s'", optname, optarg); }
/** diff --git a/fwd.c b/fwd.c index f27a4220..70ef73a3 100644 --- a/fwd.c +++ b/fwd.c @@ -22,6 +22,7 @@ #include
#include "util.h" +#include "epoll_ctl.h" #include "ip.h" #include "siphash.h" #include "inany.h" @@ -420,6 +421,160 @@ void fwd_rules_print(const struct fwd_ports *fwd) } }
+/** fwd_sync_one() - Create or remove listening sockets for a forward entry + * @c: Execution context + * @rule: Forwarding rule + * @pif: Interface to create listening sockets for + * @proto: Protocol to listen for + * @scanmap: Bitmap of ports to listen for on FWD_SCAN entries + * + * Return: 0 on success, -1 on failure + */ +static int fwd_sync_one(const struct ctx *c, const struct fwd_rule *rule, + uint8_t pif, uint8_t proto, const uint8_t *scanmap) +{ + const union inany_addr *addr = fwd_rule_addr(rule); + const char *ifname = rule->ifname; + bool bound_one = false; + unsigned port; + + ASSERT(pif_is_socket(pif)); + + if (!*ifname) + ifname = NULL; + + for (port = rule->first; port <= rule->last; port++) { + int fd = rule->socks[port - rule->first]; + + if ((rule->flags & FWD_SCAN) && !bitmap_isset(scanmap, port)) { + /* We don't want to listen on this port */ + if (fd >= 0) { + /* We already are, so stop */ + epoll_del(c->epollfd, fd); + close(fd); + rule->socks[port - rule->first] = -1; + } + continue; + } + + if (fd >= 0) /* Already listening, nothing to do */ { + bound_one = true; + continue; + } + + if (proto == IPPROTO_TCP) + fd = tcp_listen(c, pif, addr, ifname, port); + else if (proto == IPPROTO_UDP) + fd = udp_listen(c, pif, addr, ifname, port); + else + ASSERT(0); + + if (fd < 0) { + char astr[INANY_ADDRSTRLEN] = "";
Should we perhaps make this "*" for consistency with fwd_rules_print()?
Good idea, that simplifies things a bit too. This code predates my extension to inany_ntop(), and I forgot to rework it to take advantage.
+ + if (addr) + inany_ntop(addr, astr, sizeof(astr)); + + warn("Listen failed for %s %s port %s%s%s%s%u: %s", + pif_name(pif), ipproto_name(proto), + astr, ifname ? "%" : "", ifname ? ifname : "", + addr || ifname ? "/" : "", port, strerror_(-fd)); + + if (!(rule->flags & FWD_WEAK)) + return -1; + + continue; + } + + rule->socks[port - rule->first] = fd; + bound_one = true; + } + + if (!bound_one && !(rule->flags & FWD_SCAN)) { + char astr[INANY_ADDRSTRLEN] = "";
Same here.
Done.
+ + if (addr) + inany_ntop(addr, astr, sizeof(astr)); + + warn("All listens failed for %s %s %s%s%s%s%u-%u", + pif_name(pif), ipproto_name(proto), + astr, ifname ? "%" : "", ifname ? ifname : "", + addr || ifname ? "/" : "", rule->first, rule->last); + return -1; + } + + return 0; +} + +/** struct fwd_listen_args - arguments for fwd_listen_init_() + * @c: Execution context + * @fwd: Forwarding information + * @scanmap: Bitmap of ports to auto-forward + * @pif: Interface to create listening sockets for + * @proto: Protocol + * @ret: Return code + */ +struct fwd_listen_args { + const struct ctx *c; + const struct fwd_ports *fwd; + const uint8_t *scanmap; + uint8_t pif; + uint8_t proto; + int ret; +}; + +/** fwd_listen_sync_() - Update listening sockets to match forwards + * @arg: struct fwd_listen_args with arguments + * + * Returns: zero + */ +static int fwd_listen_sync_(void *arg) +{ + struct fwd_listen_args *a = arg; + unsigned i; + + if (a->pif == PIF_SPLICE) + ns_enter(a->c); + + for (i = 0; i < a->fwd->count; i++) { + a->ret = fwd_sync_one(a->c, &a->fwd->rules[i], + a->pif, a->proto, a->fwd->map); + if (a->ret < 0) + break; + } + + return 0; +} + +/** fwd_listen_sync() - Update listening sockets to match forwards
This has the same description as fwd_listen_sync_() and it might be quite hard to understand the difference if one is not used to spotting the "void *arg" argument. What about:
/** fwd_listen_sync() - Call fwd_listen_sync_() in the intended namespace
?
Fair point, done.
+ * @c: Execution context + * @fwd: Forwarding information + * @pif: Interface to create listening sockets for + * @proto: Protocol + * + * Return: 0 on success, -1 on failure + */ +int fwd_listen_sync(const struct ctx *c, const struct fwd_ports *fwd, + uint8_t pif, uint8_t proto) +{ + struct fwd_listen_args a = { + .c = c, .fwd = fwd, .pif = pif, .proto = proto, + }; + + if (pif == PIF_SPLICE) + NS_CALL(fwd_listen_sync_, &a); + else + fwd_listen_sync_(&a); + + if (a.ret < 0) { + err("Couldn't listen on requested %s ports", + ipproto_name(proto)); + return -1; + } + + return 0; +} + /* See enum in kernel's include/net/tcp_states.h */ #define UDP_LISTEN 0x07 #define TCP_LISTEN 0x0a @@ -578,10 +733,14 @@ void fwd_scan_ports_timer(struct ctx *c, const struct timespec *now)
fwd_scan_ports(c);
- if (!c->no_tcp) - tcp_port_rebind_all(c); - if (!c->no_udp) - udp_port_rebind_all(c); + if (!c->no_tcp) { + fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP); + fwd_listen_sync(c, &c->tcp.fwd_out, PIF_SPLICE, IPPROTO_TCP); + } + if (!c->no_udp) { + fwd_listen_sync(c, &c->udp.fwd_in, PIF_HOST, IPPROTO_UDP); + fwd_listen_sync(c, &c->udp.fwd_out, PIF_SPLICE, IPPROTO_UDP); + } }
/** diff --git a/fwd.h b/fwd.h index 3ddcb91d..f84e7c01 100644 --- a/fwd.h +++ b/fwd.h @@ -108,6 +108,9 @@ void fwd_rules_print(const struct fwd_ports *fwd); void fwd_scan_ports_init(struct ctx *c); void fwd_scan_ports_timer(struct ctx * c, const struct timespec *now);
+int fwd_listen_sync(const struct ctx *c, const struct fwd_ports *fwd, + uint8_t pif, uint8_t proto); + bool nat_inbound(const struct ctx *c, const union inany_addr *addr, union inany_addr *translated); uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto, diff --git a/ip.c b/ip.c index f1d224bd..fc26dab2 100644 --- a/ip.c +++ b/ip.c @@ -78,7 +78,6 @@ found: * /etc/protocols and might allocate, which isn't possible for us once * self-isolated. */ -/* cppcheck-suppress unusedFunction */ const char *ipproto_name(uint8_t proto) { switch (proto) { diff --git a/tcp.c b/tcp.c index 57faed4b..976f0ab7 100644 --- a/tcp.c +++ b/tcp.c @@ -2732,50 +2732,6 @@ int tcp_listen(const struct ctx *c, uint8_t pif, return s; }
-/** - * tcp_ns_listen() - Init socket to listen for spliced outbound connections - * @c: Execution context - * @port: Port, host order - */ -static void tcp_ns_listen(const struct ctx *c, in_port_t port) -{ - ASSERT(!c->no_tcp); - - if (!c->no_bindtodevice) { - tcp_listen(c, PIF_SPLICE, NULL, "lo", port); - return; - } - - if (c->ifi4) - tcp_listen(c, PIF_SPLICE, &inany_loopback4, NULL, port); - if (c->ifi6) - tcp_listen(c, PIF_SPLICE, &inany_loopback6, NULL, port); -} - -/** - * tcp_ns_socks_init() - Bind sockets in namespace for outbound connections - * @arg: Execution context - * - * Return: 0 - */ -/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ -static int tcp_ns_socks_init(void *arg) -{ - const struct ctx *c = (const struct ctx *)arg; - unsigned port; - - ns_enter(c); - - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(c->tcp.fwd_out.map, port)) - continue; - - tcp_ns_listen(c, port); - } - - return 0; -} - /** * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets * @pool: Pool of sockets to refill @@ -2919,10 +2875,13 @@ int tcp_init(struct ctx *c)
tcp_sock_refill_init(c);
+ if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0) + return -1;
This needs an update to the function comment (which currently says "Return: 0, doesn't return on failure").
* Return: 0, doesn't return on failure
Fixed.
if (c->mode == MODE_PASTA) { tcp_splice_init(c); - - NS_CALL(tcp_ns_socks_init, c); + if (fwd_listen_sync(c, &c->tcp.fwd_out, + PIF_SPLICE, IPPROTO_TCP) < 0) + return -1; }
peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) && @@ -2941,77 +2900,6 @@ int tcp_init(struct ctx *c) return 0; }
-/** - * tcp_port_rebind() - Rebind ports to match forward maps - * @c: Execution context - * @outbound: True to remap outbound forwards, otherwise inbound - * - * Must be called in namespace context if @outbound is true. - */ -static void tcp_port_rebind(struct ctx *c, bool outbound) -{ - const uint8_t *fmap = outbound ? c->tcp.fwd_out.map : c->tcp.fwd_in.map; - int (*socks)[IP_VERSIONS] = outbound ? tcp_sock_ns : tcp_sock_init_ext; - unsigned port; - - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(fmap, port)) { - if (socks[port][V4] >= 0) { - close(socks[port][V4]); - socks[port][V4] = -1; - } - - if (socks[port][V6] >= 0) { - close(socks[port][V6]); - socks[port][V6] = -1; - } - - continue; - } - - if ((c->ifi4 && socks[port][V4] == -1) || - (c->ifi6 && socks[port][V6] == -1)) { - if (outbound) - tcp_ns_listen(c, port); - else - tcp_listen(c, PIF_HOST, NULL, NULL, port); - } - } -} - -/** - * tcp_port_rebind_outbound() - Rebind ports in namespace - * @arg: Execution context - * - * Called with NS_CALL() - * - * Return: 0 - */ -static int tcp_port_rebind_outbound(void *arg) -{ - struct ctx *c = (struct ctx *)arg; - - ns_enter(c); - tcp_port_rebind(c, true); - - return 0; -} - -/** - * tcp_port_rebind_all() - Rebind ports to match forward maps (in host & ns) - * @c: Execution context - */ -void tcp_port_rebind_all(struct ctx *c) -{ - ASSERT(c->mode == MODE_PASTA && !c->no_tcp); - - if (c->tcp.fwd_out.mode == FWD_AUTO) - NS_CALL(tcp_port_rebind_outbound, c); - - if (c->tcp.fwd_in.mode == FWD_AUTO) - tcp_port_rebind(c, false); -} - /** * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill * @c: Execution context diff --git a/tcp.h b/tcp.h index ef1e3544..45f97d93 100644 --- a/tcp.h +++ b/tcp.h @@ -22,7 +22,6 @@ int tcp_listen(const struct ctx *c, uint8_t pif, const union inany_addr *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); -void tcp_port_rebind_all(struct ctx *c); void tcp_timer(const struct 
ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c);
diff --git a/udp.c b/udp.c index d7dcb1d2..7c5546df 100644 --- a/udp.c +++ b/udp.c @@ -1203,98 +1203,6 @@ static void udp_splice_iov_init(void) } }
-/** - * udp_ns_listen() - Init socket to listen for spliced outbound connections - * @c: Execution context - * @port: Port, host order - */ -static void udp_ns_listen(const struct ctx *c, in_port_t port) -{ - ASSERT(!c->no_udp); - - if (!c->no_bindtodevice) { - udp_listen(c, PIF_SPLICE, NULL, "lo", port); - return; - } - - if (c->ifi4) - udp_listen(c, PIF_SPLICE, &inany_loopback4, NULL, port); - if (c->ifi6) - udp_listen(c, PIF_SPLICE, &inany_loopback6, NULL, port); -} - -/** - * udp_port_rebind() - Rebind ports to match forward maps - * @c: Execution context - * @outbound: True to remap outbound forwards, otherwise inbound - * - * Must be called in namespace context if @outbound is true. - */ -static void udp_port_rebind(struct ctx *c, bool outbound) -{ - int (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init; - const uint8_t *fmap - = outbound ? c->udp.fwd_out.map : c->udp.fwd_in.map; - unsigned port; - - for (port = 0; port < NUM_PORTS; port++) { - if (!bitmap_isset(fmap, port)) { - if (socks[V4][port] >= 0) { - close(socks[V4][port]); - socks[V4][port] = -1; - } - - if (socks[V6][port] >= 0) { - close(socks[V6][port]); - socks[V6][port] = -1; - } - - continue; - } - - if ((c->ifi4 && socks[V4][port] == -1) || - (c->ifi6 && socks[V6][port] == -1)) { - if (outbound) - udp_ns_listen(c, port); - else - udp_listen(c, PIF_HOST, NULL, NULL, port); - } - } -} - -/** - * udp_port_rebind_outbound() - Rebind ports in namespace - * @arg: Execution context - * - * Called with NS_CALL() - * - * Return: 0 - */ -static int udp_port_rebind_outbound(void *arg) -{ - struct ctx *c = (struct ctx *)arg; - - ns_enter(c); - udp_port_rebind(c, true); - - return 0; -} - -/** - * udp_port_rebind_all() - Rebind ports to match forward maps (in host & ns) - * @c: Execution context - */ -void udp_port_rebind_all(struct ctx *c) -{ - ASSERT(c->mode == MODE_PASTA && !c->no_udp); - - if (c->udp.fwd_out.mode == FWD_AUTO) - NS_CALL(udp_port_rebind_outbound, c); - - if 
(c->udp.fwd_in.mode == FWD_AUTO) - udp_port_rebind(c, false); -} - /** * udp_init() - Initialise per-socket data, and sockets in namespace * @c: Execution context @@ -1307,9 +1215,14 @@ int udp_init(struct ctx *c)
udp_iov_init(c);
+ if (fwd_listen_sync(c, &c->udp.fwd_in, PIF_HOST, IPPROTO_UDP) < 0) + return -1;
Same here, update to the function comment needed.
Fixed.
+ if (c->mode == MODE_PASTA) { udp_splice_iov_init(); - NS_CALL(udp_port_rebind_outbound, c); + if (fwd_listen_sync(c, &c->udp.fwd_out, + PIF_SPLICE, IPPROTO_UDP) < 0) + return -1; }
return 0; diff --git a/udp.h b/udp.h index 94c698e2..73efe036 100644 --- a/udp.h +++ b/udp.h @@ -19,7 +19,6 @@ int udp_listen(const struct ctx *c, uint8_t pif, const union inany_addr *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); -void udp_port_rebind_all(struct ctx *c); void udp_update_l2_buf(const unsigned char *eth_d);
/**
-- Stefano
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson