Currently UDP only has a very rudimentary (and buggy) form of connection tracking implemented with per-port flags. Make a start on converting this to more robust tracking via the flow table. Start matching UDP packets to flow table entries, creating them when necessary. We also add a timer so that the flows will expire. For now don't actually use the information in the flow table, that will come later. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- Makefile | 2 +- flow.c | 31 +++++++ flow.h | 4 + flow_table.h | 2 + udp.c | 228 ++++++++++++++++++++++++++++++++++++++++++++++++--- udp_flow.h | 25 ++++++ 6 files changed, 278 insertions(+), 14 deletions(-) create mode 100644 udp_flow.h diff --git a/Makefile b/Makefile index 09fc461d..92cbd5a6 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h udp_flow.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/flow.c b/flow.c index cf799082..c3d54f6e 100644 --- a/flow.c +++ b/flow.c @@ -35,6 +35,7 @@ const char *flow_type_str[] = { [FLOW_TCP_SPLICE] = "TCP connection (spliced)", [FLOW_PING4] = "ICMP ping sequence", [FLOW_PING6] = "ICMPv6 ping sequence", + [FLOW_UDP] = "UDP flow", }; static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); @@ -44,6 +45,7 @@ const uint8_t flow_proto[] = { [FLOW_TCP_SPLICE] = IPPROTO_TCP, [FLOW_PING4] = IPPROTO_ICMP, [FLOW_PING6] = IPPROTO_ICMPV6, + [FLOW_UDP] = IPPROTO_UDP, }; static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); @@ -641,6 +643,31 @@ flow_sidx_t flow_lookup_af(const struct ctx 
*c, return flowside_lookup(c, proto, pif, &fside); } +/** + * flow_lookup_sa() - Look up a flow given an endpoint socket address + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: Interface of the flow + * @esa: Socket address of the endpoint + * @fport: Forwarding port number + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, + const void *esa, in_port_t fport) +{ + struct flowside fside = { + .fport = fport, + }; + + inany_from_sockaddr(&fside.eaddr, &fside.eport, esa); + if (inany_v4(&fside.eaddr)) + fside.faddr = inany_any4; + else + fside.faddr = inany_any6; + return flowside_lookup(c, proto, pif, &fside); +} + /** * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context @@ -720,6 +747,10 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) if (timer) closed = icmp_ping_timer(c, &flow->ping, now); break; + case FLOW_UDP: + if (timer) + closed = udp_flow_timer(c, &flow->udp, now); + break; default: /* Assume other flow types don't need any handling */ ; diff --git a/flow.h b/flow.h index 948f2ea9..b5ca2792 100644 --- a/flow.h +++ b/flow.h @@ -115,6 +115,8 @@ enum flow_type { FLOW_PING4, /* ICMPv6 echo requests from guest to host and matching replies back */ FLOW_PING6, + /* UDP packets with the matching unicast endpoints */ + FLOW_UDP, FLOW_NUM_TYPES, }; @@ -227,6 +229,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, uint8_t proto, uint8_t pif, sa_family_t af, const void *eaddr, const void *faddr, in_port_t eport, in_port_t fport); +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, + const void *esa, in_port_t fport); union flow; diff --git a/flow_table.h b/flow_table.h index 07c59041..6cf4f2b7 100644 --- a/flow_table.h +++ b/flow_table.h @@ -9,6 +9,7 @@ #include "tcp_conn.h" #include "icmp_flow.h" +#include "udp_flow.h" 
/** * struct flow_free_cluster - Information about a cluster of free entries @@ -35,6 +36,7 @@ union flow { struct tcp_tap_conn tcp; struct tcp_splice_conn tcp_splice; struct icmp_ping_flow ping; + struct udp_flow udp; }; /* Global Flow Table */ diff --git a/udp.c b/udp.c index e79ca938..cb6db5c5 100644 --- a/udp.c +++ b/udp.c @@ -15,10 +15,49 @@ /** * DOC: Theory of Operation * + * Flow Table + * ========== * - * For UDP, a reduced version of port-based connection tracking is implemented - * with two purposes: - * - binding ephemeral ports when they're used as source port by the guest, so + * UDP does not have connections, but to reliably forward reply packets back to + * the original requester, we must keep track of pseudo-connections. We do this + * via the generic flow table. + * + * - Finding an existing flow + * + * When we receive a datagram we attempt to match it to an existing flow: one + * with matching interface, addresses and ports (both forwarding and + * endpoint). For socket interfaces, we treat the forwarding address as the + * bound address of the receiving socket, which may be unspecified, rather + * than the datagram's actual destination address (which is awkward to + * determine for unbound sockets). + * + * - Creating a new flow + * + * If no matching flow exists, and the datagram comes either from the tap + * interface, or from a socket with the 'orig' flag set we create a new one. + * The initiating side records the interface, endpoint and forwarding + * addresses and ports of this first datagram. Again, we treat the forwarding + * address for sockets as the socket's bound address, regardless of the + * datagram's actual destination. + * + * The target side interface and addresses are assigned by the general code in + * fwd.c. When the target is a socket interface, the target forwarding + * address may be left unspecified - in this case, the kernel will determine + * the source address when we send the datagram. 
+ * + * - Flow expiry + * + * Every time a datagram is received that matches a flow (or creates a new + * one), we update the flow's timestamp to the current time. Periodically we + * scan flows and those which are older than UDP_CONN_TIMEOUT (180s) are + * removed. + * + * Port Tracking + * ============= + * + * For datagrams not handled by the flow table, a reduced version of port-based + * connection tracking is implemented with two purposes: + * - binding ephemeral ports when they're used as source port by the guest, so * that replies on those ports can be forwarded back to the guest, with a * fixed timeout for this binding * - packets received from the local host get their source changed to a local @@ -121,6 +160,7 @@ #include "tap.h" #include "pcap.h" #include "log.h" +#include "flow_table.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ @@ -199,6 +239,7 @@ static struct ethhdr udp6_eth_hdr; * @taph: Tap backend specific header * @s_in: Source socket address, filled in by recvmmsg() * @splicesrc: Source port for splicing, or -1 if not spliceable + * @tosidx: sidx for the destination side of this datagram's flow */ static struct udp_meta_t { struct ipv6hdr ip6h; @@ -207,6 +248,7 @@ static struct udp_meta_t { union sockaddr_inany s_in; int splicesrc; + flow_sidx_t tosidx; } #ifdef __AVX2__ __attribute__ ((aligned(32))) @@ -253,6 +295,17 @@ static struct sockaddr_in6 udp6_localname = { static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +struct udp_flow *udp_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + + ASSERT(flow->f.type == FLOW_UDP); + return &flow->udp; +} + /** * udp_portmap_clear() - Clear UDP port map before configuration */ @@ -492,6 +545,67 @@ static int udp_mmh_splice_port(union udp_epoll_ref uref, return -1; } +/** + * udp_flow_from_sock() - 
Find or create UDP flow for datagrams from socket + * @c: Execution context + * @uref: UDP epoll reference of the originating socket + * @meta: Metadata buffer for the datagram + * + * Return: sidx for the destination side of the flow for this packet, or + * FLOW_SIDX_NONE if we couldn't find or create a flow. + */ +flow_sidx_t udp_flow_from_sock(const struct ctx *c, union udp_epoll_ref uref, + struct udp_meta_t *meta) +{ + char sstr[INANY_ADDRSTRLEN]; + const struct flowside *ini; + struct udp_flow *uflow; + union flow *flow; + flow_sidx_t sidx; + + sidx = flow_lookup_sa(c, IPPROTO_UDP, uref.pif, &meta->s_in, uref.port); + if ((flow = flow_at_sidx(sidx))) + return FLOW_SIDX(flow, !sidx.side); + + if (!uref.orig) + return FLOW_SIDX_NONE; + + if (!(flow = flow_alloc())) { + char sastr[SOCKADDR_STRLEN]; + + debug("Couldn't allocate flow for UDP datagram from %s %s", + pif_name(uref.pif), + sockaddr_ntop(&meta->s_in, sastr, sizeof(sastr))); + return FLOW_SIDX_NONE; + } + + ini = flow_initiate_sa(flow, uref.pif, &meta->s_in, uref.port); + + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { + flow_dbg(flow, "Invalid endpoint on UDP recv()"); + /* Invalid endpoint */ + goto cancel; + } + + if (!flow_target(c, flow, IPPROTO_UDP)) + goto cancel; + + uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); + flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); + flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE)); + FLOW_ACTIVATE(uflow); + + return FLOW_SIDX(uflow, TGTSIDE); + +cancel: + flow_dbg(flow, "Couldn't create UDP flow for %s [%s]:%hu -> ?:%hu", + pif_name(uref.pif), + inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), + ini->eport, ini->fport); + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; +} + /** * udp_splice_send() - Send datagrams from socket to socket * @c: Execution context @@ -536,6 +650,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, break; udp_meta[i].splicesrc = udp_mmh_splice_port(uref, &mmh_recv[i]); + udp_meta[i].tosidx = 
udp_flow_from_sock(c, uref, &udp_meta[i]); } while (udp_meta[i].splicesrc == src); if (uref.pif == PIF_SPLICE) { @@ -758,6 +873,7 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, break; udp_meta[i].splicesrc = udp_mmh_splice_port(uref, &mmh_recv[i]); + udp_meta[i].tosidx = udp_flow_from_sock(c, uref, &udp_meta[i]); } while (udp_meta[i].splicesrc == -1); tap_send_frames(c, &tap_iov[start][0], UDP_NUM_IOVS, i - start); @@ -786,8 +902,8 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve */ ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); in_port_t dstport = ref.udp.port; - bool v6 = ref.udp.v6; struct mmsghdr *mmh_recv; + bool v6 = ref.udp.v6; int i, m; if (c->no_udp || !(events & EPOLLIN)) @@ -797,6 +913,8 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve dstport += c->udp.fwd_out.f.delta[dstport]; else if (ref.udp.pif == PIF_HOST) dstport += c->udp.fwd_in.f.delta[dstport]; + else + ASSERT(0); if (v6) mmh_recv = udp6_l2_mh_sock; @@ -809,12 +927,13 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve /* We divide things into batches based on how we need to send them, * determined by udp_meta[i].splicesrc. To avoid either two passes - * through the array, or recalculating splicesrc for a single entry, we - * have to populate it one entry *ahead* of the loop counter (if - * present). So we fill in entry 0 before the loop, then udp_*_send() - * populate one entry past where they consume. + * through the array, or recalculating splicesrc and tosidx for a single + * entry, we have to populate them one entry *ahead* of the loop counter + * (if present). So we fill in entry 0 before the loop, then + * udp_*_send() populate one entry past where they consume. 
*/ udp_meta[0].splicesrc = udp_mmh_splice_port(ref.udp, mmh_recv); + udp_meta[0].tosidx = udp_flow_from_sock(c, ref.udp, &udp_meta[0]); for (i = 0; i < n; i += m) { if (udp_meta[i].splicesrc >= 0) m = udp_splice_send(c, i, n, dstport, ref.udp, now); @@ -823,6 +942,74 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve } } +/** + * udp_flow_from_tap() - Find or create UDP flow for tap packets + * @c: Execution context + * @pif: pif on which the packet is arriving + * @af: Address family, AF_INET or AF_INET6 + * @saddr: Source address on guest side + * @daddr: Destination address on guest side + * @srcport: Source port on guest side + * @dstport: Destination port on guest side + * + * Return: sidx for the destination side of the flow for this packet, or + * FLOW_SIDX_NONE if we couldn't find or create a flow. + */ +flow_sidx_t udp_flow_from_tap(const struct ctx *c, + uint8_t pif, sa_family_t af, + const void *saddr, const void *daddr, + in_port_t srcport, in_port_t dstport) +{ + const struct flowside *ini; + struct udp_flow *uflow; + union flow *flow; + flow_sidx_t sidx; + + ASSERT(pif == PIF_TAP); + + sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, + srcport, dstport); + if ((flow = flow_at_sidx(sidx))) + return FLOW_SIDX(flow, !sidx.side); + + if (!(flow = flow_alloc())) + return FLOW_SIDX_NONE; + + ini = flow_initiate_af(flow, PIF_TAP, af, + saddr, srcport, daddr, dstport); + + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || + !inany_is_unicast(&ini->faddr) || ini->fport == 0) { + char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN]; + + debug("Invalid UDP endpoint from %s: %s:%hu -> %s:%hu", + pif_name(pif), + inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport, + inany_ntop(&ini->faddr, dstr, sizeof(dstr)), ini->fport); + goto cancel; + } + + if (!flow_target(c, flow, IPPROTO_UDP)) + goto cancel; + + if (flow->f.pif[TGTSIDE] != PIF_HOST) { + flow_err(flow, "No support for forwarding UDP from %s to %s", + 
pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); + goto cancel; + } + + uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); + flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); + flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE)); + FLOW_ACTIVATE(uflow); + return FLOW_SIDX(uflow, TGTSIDE); + +cancel: + flow_alloc_cancel(flow); + return FLOW_SIDX_NONE; +} + /** * udp_tap_handler() - Handle packets from tap * @c: Execution context @@ -847,15 +1034,13 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, struct sockaddr_in6 s_in6; struct sockaddr_in s_in; const struct udphdr *uh; + struct udp_flow *uflow; struct sockaddr *sa; int i, s, count = 0; in_port_t src, dst; + flow_sidx_t sidx; socklen_t sl; - (void)c; - (void)saddr; - (void)pif; - uh = packet_get(p, idx, 0, sizeof(*uh), NULL); if (!uh) return 1; @@ -864,8 +1049,14 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, * and destination, so we can just take those from the first message. */ src = ntohs(uh->source); - src += c->udp.fwd_in.rdelta[src]; dst = ntohs(uh->dest); + sidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst); + if ((uflow = udp_at_sidx(sidx))) + uflow->ts = now->tv_sec; + else + debug("UDP from tap without flow"); + + src += c->udp.fwd_in.rdelta[src]; if (af == AF_INET) { s_in = (struct sockaddr_in) { @@ -1211,6 +1402,17 @@ static int udp_port_rebind_outbound(void *arg) return 0; } +bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow, + const struct timespec *now) +{ + if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) + return false; + + flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); + flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); + return true; +} + /** * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context diff --git a/udp_flow.h b/udp_flow.h new file mode 100644 index 00000000..18af9ac4 --- /dev/null +++ b/udp_flow.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David 
Gibson <david(a)gibson.dropbear.id.au> + * + * UDP flow tracking data structures + */ +#ifndef UDP_FLOW_H +#define UDP_FLOW_H + +/** + * struct udp_flow - Descriptor for a flow of UDP packets + * @f: Generic flow information + * @ts: Activity timestamp + */ +struct udp_flow { + /* Must be first element */ + struct flow_common f; + + time_t ts; +}; + +bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow, + const struct timespec *now); + +#endif /* UDP_FLOW_H */ -- 2.45.2