On Tue, 14 May 2024 11:03:36 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:

Currently the code to translate host side addresses and ports to guest side addresses and ports, and vice versa, is scattered across the TCP code. This includes both port redirection as controlled by the -t and -T options, and our special case NAT controlled by the --no-map-gw option. Gather this logic into fwd_from_*() functions for each input interface in fwd.c which take protocol and address information for the initiating side and generates the pif and address information for the forwarded side. This performs any NAT or port forwarding needed. We create a flow_forward() helper which applies those forwarding functions as needed to automatically move a flow from INI to FWD state. For now we leave the older flow_forward_af() function taking explicit addresses as a transitional tool. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 53 +++++++++++++++++++++++++ flow_table.h | 2 + fwd.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++ fwd.h | 12 ++++++ tcp.c | 102 +++++++++++++++-------------------------------- tcp_splice.c | 63 ++--------------------------- tcp_splice.h | 5 +-- 7 files changed, 213 insertions(+), 134 deletions(-) diff --git a/flow.c b/flow.c index 4942075..a6afe39 100644 --- a/flow.c +++ b/flow.c @@ -304,6 +304,59 @@ const struct flowside *flow_forward_af(union flow *flow, uint8_t pif, return fwd; } + +/** + * flow_forward() - Determine where flow should forward to, and move to FWD + * @c: Execution context + * @flow: Flow to forward + * @proto: Protocol + * + * Return: pointer to the forwarded flowside information + */ +const struct flowside *flow_forward(const struct ctx *c, union flow *flow, + uint8_t proto) +{ + char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; + struct flow_common *f = &flow->f; + const struct flowside *ini = &f->side[INISIDE]; + struct flowside *fwd = &f->side[FWDSIDE]; + uint8_t pif1 = PIF_NONE;

This could 
now be 'pif_fwd' / 'pif_tgt', right?

+ + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI); + ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[FWDSIDE] == PIF_NONE); + ASSERT(flow->f.state == FLOW_STATE_INI); + + switch (f->pif[INISIDE]) { + case PIF_TAP: + pif1 = fwd_from_tap(c, proto, ini, fwd); + break; + + case PIF_SPLICE: + pif1 = fwd_from_splice(c, proto, ini, fwd); + break; + + case PIF_HOST: + pif1 = fwd_from_host(c, proto, ini, fwd); + break; + + default: + flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu", + pif_name(f->pif[INISIDE]), + inany_ntop(&ini->eaddr, estr, sizeof(estr)), + ini->eport, + inany_ntop(&ini->faddr, fstr, sizeof(fstr)), + ini->fport); + } + + if (pif1 == PIF_NONE) + return NULL; + + f->pif[FWDSIDE] = pif1; + flow_set_state(f, FLOW_STATE_FWD); + return fwd; +} + /** * flow_set_type() - Set type and move to TYPED state * @flow: Flow to change state diff --git a/flow_table.h b/flow_table.h index d17ffba..3ac0b8c 100644 --- a/flow_table.h +++ b/flow_table.h @@ -118,6 +118,8 @@ const struct flowside *flow_forward_af(union flow *flow, uint8_t pif, sa_family_t af, const void *saddr, in_port_t sport, const void *daddr, in_port_t dport); +const struct flowside *flow_forward(const struct ctx *c, union flow *flow, + uint8_t proto); union flow *flow_set_type(union flow *flow, enum flow_type type); #define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) diff --git a/fwd.c b/fwd.c index b3d5a37..5fe2361 100644 --- a/fwd.c +++ b/fwd.c @@ -25,6 +25,7 @@ #include "fwd.h" #include "passt.h" #include "lineread.h" +#include "flow_table.h" /* See enum in kernel's include/net/tcp_states.h */ #define UDP_LISTEN 0x07 @@ -154,3 +155,112 @@ void fwd_scan_ports_init(struct ctx *c) &c->tcp.fwd_out, &c->tcp.fwd_in); } } + +uint8_t fwd_from_tap(const struct ctx *c, uint8_t proto, + const struct flowside *a, struct flowside *b)

A function comment would be nice to have, albeit a bit redundant. 
Now 'a' and 'b' could also be called 'ini' and 'tgt' I guess?

+{ + (void)proto; + + b->eaddr = a->faddr; + b->eport = a->fport; + + if (!c->no_map_gw) { + struct in_addr *v4 = inany_v4(&b->eaddr); + + if (v4 && IN4_ARE_ADDR_EQUAL(v4, &c->ip4.gw)) + *v4 = in4addr_loopback; + if (IN6_ARE_ADDR_EQUAL(&b->eaddr, &c->ip6.gw)) + b->eaddr.a6 = in6addr_loopback;

I haven't tested this, but I'm a bit lost: I thought that in this case we would also set b->faddr here. Where does that happen?

+ } + + return PIF_HOST; +} + +uint8_t fwd_from_splice(const struct ctx *c, uint8_t proto, + const struct flowside *a, struct flowside *b) +{ + const struct in_addr *ae4 = inany_v4(&a->eaddr); + + if (!inany_is_loopback(&a->eaddr) || + (!inany_is_loopback(&a->faddr) && !inany_is_unspecified(&a->faddr))) { + char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; + + debug("Non loopback address on %s: [%s]:%hu -> [%s]:%hu", + pif_name(PIF_SPLICE), + inany_ntop(&a->eaddr, estr, sizeof(estr)), a->eport, + inany_ntop(&a->faddr, fstr, sizeof(fstr)), a->fport); + return PIF_NONE; + } + + if (ae4) + inany_from_af(&b->eaddr, AF_INET, &in4addr_loopback); + else + inany_from_af(&b->eaddr, AF_INET6, &in6addr_loopback); + + b->eport = a->fport; + + if (proto == IPPROTO_TCP) + b->eport += c->tcp.fwd_out.delta[b->eport]; + + return PIF_HOST; +} + +uint8_t fwd_from_host(const struct ctx *c, uint8_t proto, + const struct flowside *a, struct flowside *b) +{ + struct in_addr *bf4; + + if (c->mode == MODE_PASTA && inany_is_loopback(&a->eaddr) && + proto == IPPROTO_TCP) { + /* spliceable */

Before we conclude this, does f->pif[INISIDE] == PIF_HOST in the caller guarantee that inany_is_loopback(&a->faddr), too? 
If not, we shouldn't splice unless that's true as well.

+ b->faddr = a->eaddr; + + if (inany_v4(&a->eaddr)) + inany_from_af(&b->eaddr, AF_INET, &in4addr_loopback); + else + inany_from_af(&b->eaddr, AF_INET6, &in6addr_loopback); + b->eport = a->fport; + if (proto == IPPROTO_TCP) + b->eport += c->tcp.fwd_in.delta[b->eport]; + + return PIF_SPLICE; + } + + b->faddr = a->eaddr; + b->fport = a->eport; + + bf4 = inany_v4(&b->faddr); + + if (bf4) { + if (IN4_IS_ADDR_LOOPBACK(bf4) || + IN4_IS_ADDR_UNSPECIFIED(bf4) || + IN4_ARE_ADDR_EQUAL(bf4, &c->ip4.addr_seen)) + *bf4 = c->ip4.gw; + } else { + struct in6_addr *bf6 = &b->faddr.a6; + + if (IN6_IS_ADDR_LOOPBACK(bf6) || + IN6_ARE_ADDR_EQUAL(bf6, &c->ip6.addr_seen) || + IN6_ARE_ADDR_EQUAL(bf6, &c->ip6.addr)) { + if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) + *bf6 = c->ip6.gw; + else + *bf6 = c->ip6.addr_ll; + } + } + + if (bf4) { + inany_from_af(&b->eaddr, AF_INET, &c->ip4.addr_seen); + } else { + if (IN6_IS_ADDR_LINKLOCAL(&b->faddr.a6)) + b->eaddr.a6 = c->ip6.addr_ll_seen; + else + b->eaddr.a6 = c->ip6.addr_seen; + } + + b->eport = a->fport; + if (proto == IPPROTO_TCP) + b->eport += c->tcp.fwd_in.delta[b->eport];

As we do this in any case, spliced or not spliced, I would find it less confusing to have these assignments in common, earlier (I just spent half an hour trying to figure out why you wouldn't set b->eport for the non-spliced case...).

+ + return PIF_TAP; +} diff --git a/fwd.h b/fwd.h index 41645d7..eefe0f0 100644 --- a/fwd.h +++ b/fwd.h @@ -7,6 +7,8 @@ #ifndef FWD_H #define FWD_H +struct flowside; + /* Number of ports for both TCP and UDP */ #define NUM_PORTS (1U << 16) @@ -42,4 +44,14 @@ void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev, const struct fwd_ports *tcp_rev); void fwd_scan_ports_init(struct ctx *c); +uint8_t fwd_from_tap(const struct ctx *c, uint8_t proto, + const struct flowside *a, struct flowside *b); +uint8_t fwd_from_splice(const struct ctx *c, uint8_t proto, + const struct flowside 
*a, struct flowside *b); +uint8_t fwd_from_host(const struct ctx *c, uint8_t proto, + const struct flowside *a, struct flowside *b); + +bool fwd_nat_flow(const struct ctx *c, uint8_t proto, + const struct flowside *a, struct flowside *b); + #endif /* FWD_H */ diff --git a/tcp.c b/tcp.c index 91b8a46..7e08b53 100644 --- a/tcp.c +++ b/tcp.c @@ -1759,7 +1759,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, in_port_t dstport = ntohs(th->dest); const struct flowside *ini, *fwd; struct tcp_tap_conn *conn; - union inany_addr dstaddr; /* FIXME: Avoid bulky temporary */ union sockaddr_inany sa; union flow *flow; int s = -1, mss; @@ -1782,22 +1781,18 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, goto cancel; } - if ((s = tcp_conn_sock(c, af)) < 0) + if (!(fwd = flow_forward(c, flow, IPPROTO_TCP))) goto cancel; - dstaddr = ini->faddr; - - if (!c->no_map_gw) { - struct in_addr *v4 = inany_v4(&dstaddr); - - if (v4 && IN4_ARE_ADDR_EQUAL(v4, &c->ip4.gw)) - *v4 = in4addr_loopback; - if (IN6_ARE_ADDR_EQUAL(&dstaddr, &c->ip6.gw)) - dstaddr.a6 = in6addr_loopback; + if (flow->f.pif[FWDSIDE] != PIF_HOST) { + flow_err(flow, "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[FWDSIDE])); + goto cancel; } - fwd = flow_forward_af(flow, PIF_HOST, AF_INET6, - &inany_any6, srcport, &dstaddr, dstport); + if ((s = tcp_conn_sock(c, af)) < 0) + goto cancel; if (IN6_IS_ADDR_LINKLOCAL(&fwd->eaddr)) { struct sockaddr_in6 addr6_ll = { @@ -2479,70 +2474,21 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) conn_flag(c, conn, ACK_FROM_TAP_DUE); } -/** - * tcp_snat_inbound() - Translate source address for inbound data if needed - * @c: Execution context - * @addr: Source address of inbound packet/connection - */ -static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) -{ - struct in_addr *addr4 = inany_v4(addr); - - if (addr4) { - if (IN4_IS_ADDR_LOOPBACK(addr4) || - 
IN4_IS_ADDR_UNSPECIFIED(addr4) || - IN4_ARE_ADDR_EQUAL(addr4, &c->ip4.addr_seen)) - *addr4 = c->ip4.gw; - } else { - struct in6_addr *addr6 = &addr->a6; - - if (IN6_IS_ADDR_LOOPBACK(addr6) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr)) { - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - *addr6 = c->ip6.gw; - else - *addr6 = c->ip6.addr_ll; - } - } -} - /** * tcp_tap_conn_from_sock() - Initialize state for non-spliced connection * @c: Execution context - * @dstport: Destination port for connection (host side) * @flow: flow to initialise * @s: Accepted socket * @sa: Peer socket address (from accept()) * @now: Current timestamp */ -static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, - union flow *flow, int s, - const union sockaddr_inany *sa, +static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, const struct timespec *now) { - union inany_addr saddr, daddr; /* FIXME: avoid bulky temporaries */ - struct tcp_tap_conn *conn; - in_port_t srcport; + struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); uint64_t hash; - inany_from_sockaddr(&saddr, &srcport, sa); - tcp_snat_inbound(c, &saddr); - - if (inany_v4(&saddr)) { - inany_from_af(&daddr, AF_INET, &c->ip4.addr_seen); - } else { - if (IN6_IS_ADDR_LINKLOCAL(&saddr)) - daddr.a6 = c->ip6.addr_ll_seen; - else - daddr.a6 = c->ip6.addr_seen; - } - dstport += c->tcp.fwd_in.delta[dstport]; - - flow_forward_af(flow, PIF_TAP, AF_INET6, - &saddr, srcport, &daddr, dstport); - conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); - +

Excess newline and tab.

conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; @@ -2585,8 +2531,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, ref.tcp_listen.port); - ini = &flow->f.side[INISIDE]; + ini = flow_initiate_sa(flow, ref.tcp_listen.pif, &sa, ref.tcp_listen.port); if (!inany_is_unicast(&ini->eaddr) || 
ini->eport == 0) { char str[INANY_ADDRSTRLEN]; @@ -2596,11 +2541,26 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, goto cancel; } - if (tcp_splice_conn_from_sock(c, ref.tcp_listen.pif, - ref.tcp_listen.port, flow, s, &sa)) - return; + if (!flow_forward(c, flow, IPPROTO_TCP)) + goto cancel; + + switch (flow->f.pif[FWDSIDE]) { + case PIF_SPLICE: + case PIF_HOST: + tcp_splice_conn_from_sock(c, flow, s); + break; + + case PIF_TAP: + tcp_tap_conn_from_sock(c, flow, s, now); + break; + + default: + flow_err(flow, "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[FWDSIDE])); + goto cancel; + } - tcp_tap_conn_from_sock(c, ref.tcp_listen.port, flow, s, &sa, now); return; cancel: diff --git a/tcp_splice.c b/tcp_splice.c index aa92325..a0581f0 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -395,71 +395,18 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) /** * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection * @c: Execution context - * @pif0: pif id of side 0 - * @dstport: Side 0 destination port of connection * @flow: flow to initialise * @s0: Accepted (side 0) socket * @sa: Peer address of connection * - * Return: true if able to create a spliced connection, false otherwise

Not related to this patch, but I think we should probably describe in the theory of operation for flows what's the threshold between calling flow_alloc_cancel() on a flow (which would imply returning something here, in case tcp_splice_connect() fails), and deferring that instead to a CLOSING state.

* #syscalls:pasta setsockopt */ -bool tcp_splice_conn_from_sock(const struct ctx *c, - uint8_t pif0, in_port_t dstport, - union flow *flow, int s0, - const union sockaddr_inany *sa) +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) { - struct tcp_splice_conn *conn; - union inany_addr src; - in_port_t srcport; - sa_family_t af; - uint8_t pif1; + struct tcp_splice_conn 
*conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, + tcp_splice); - if (c->mode != MODE_PASTA) - return false; - - inany_from_sockaddr(&src, &srcport, sa); - af = inany_v4(&src) ? AF_INET : AF_INET6; - - switch (pif0) { - case PIF_SPLICE: - if (!inany_is_loopback(&src)) { - char str[INANY_ADDRSTRLEN]; - - /* We can't use flow_err() etc. because we haven't set - * the flow type yet - */ - warn("Bad source address %s for splice, closing", - inany_ntop(&src, str, sizeof(str))); - - /* We *don't* want to fall back to tap */ - flow_alloc_cancel(flow); - return true; - } - - pif1 = PIF_HOST; - dstport += c->tcp.fwd_out.delta[dstport]; - break; - - case PIF_HOST: - if (!inany_is_loopback(&src)) - return false; - - pif1 = PIF_SPLICE; - dstport += c->tcp.fwd_in.delta[dstport]; - break; - - default: - return false; - } - - if (af == AF_INET) - flow_forward_af(flow, pif1, AF_INET, - NULL, 0, &in4addr_loopback, dstport); - else - flow_forward_af(flow, pif1, AF_INET6, - NULL, 0, &in6addr_loopback, dstport); - conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice); + ASSERT(c->mode == MODE_PASTA); conn->s[0] = s0; conn->s[1] = -1; @@ -473,8 +420,6 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, conn_flag(c, conn, CLOSING); FLOW_ACTIVATE(conn); - - return true; } /** diff --git a/tcp_splice.h b/tcp_splice.h index ed8f0c5..a20f3e2 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -11,10 +11,7 @@ union sockaddr_inany; void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -bool tcp_splice_conn_from_sock(const struct ctx *c, - uint8_t pif0, in_port_t dstport, - union flow *flow, int s0, - const union sockaddr_inany *sa); +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0); void tcp_splice_init(struct ctx *c); #endif /* TCP_SPLICE_H */

-- Stefano