The redesign of UDP flows required (or at least, suggested) a new batch of prelininary changes that don't rely on the core of the flow table rework. Changes since v1: * Assorted minor fixes based on Stefano's feedback * Moved test programs from contrib/ to doc/ David Gibson (11): util: sock_l4() determine protocol from epoll type rather than the reverse flow: Add flow_sidx_valid() helper udp: Pass full epoll reference through more of sock handler path udp: Rename IOV and mmsghdr arrays udp: Unify udp[46]_mh_splice udp: Unify udp[46]_l2_iov udp: Don't repeatedly initialise udp[46]_eth_hdr udp: Move some more of sock_handler tasks into sub-functions udp: Consolidate datagram batching doc: Add program to document and test assumptions about SO_REUSEADDR doc: Test behaviour of zero length datagram recv()s doc/platform-requirements/.gitignore | 2 + doc/platform-requirements/Makefile | 45 +++ doc/platform-requirements/README | 18 + doc/platform-requirements/common.c | 66 ++++ doc/platform-requirements/common.h | 47 +++ doc/platform-requirements/recv-zero.c | 74 ++++ .../reuseaddr-priority.c | 240 ++++++++++++ epoll_type.h | 41 ++ flow.h | 11 + flow_table.h | 2 +- icmp.c | 2 +- passt.h | 32 -- tcp.c | 17 +- udp.c | 365 +++++++++--------- util.c | 48 +-- util.h | 3 +- 16 files changed, 756 insertions(+), 257 deletions(-) create mode 100644 doc/platform-requirements/.gitignore create mode 100644 doc/platform-requirements/Makefile create mode 100644 doc/platform-requirements/README create mode 100644 doc/platform-requirements/common.c create mode 100644 doc/platform-requirements/common.h create mode 100644 doc/platform-requirements/recv-zero.c create mode 100644 doc/platform-requirements/reuseaddr-priority.c create mode 100644 epoll_type.h -- 2.45.2
sock_l4() creates a socket of the given IP protocol number, and adds it to the epoll state. Currently it determines the correct tag for the epoll data based on the protocol. However, we have some future cases where we might want different semantics, and therefore epoll types, for sockets of the same protocol. So, change sock_l4() to take the epoll type as an explicit parameter, and determine the protocol from that. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- epoll_type.h | 41 +++++++++++++++++++++++++++++++++++++++++ icmp.c | 2 +- passt.h | 32 -------------------------------- tcp.c | 10 +++++----- udp.c | 12 ++++++------ util.c | 48 ++++++++++++++++++++++++++---------------------- util.h | 3 ++- 7 files changed, 81 insertions(+), 67 deletions(-) create mode 100644 epoll_type.h diff --git a/epoll_type.h b/epoll_type.h new file mode 100644 index 00000000..b6c04199 --- /dev/null +++ b/epoll_type.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + */ + +#ifndef EPOLL_TYPE_H +#define EPOLL_TYPE_H + +/** + * enum epoll_type - Different types of fds we poll over + */ +enum epoll_type { + /* Special value to indicate an invalid type */ + EPOLL_TYPE_NONE = 0, + /* Connected TCP sockets */ + EPOLL_TYPE_TCP, + /* Connected TCP sockets (spliced) */ + EPOLL_TYPE_TCP_SPLICE, + /* Listening TCP sockets */ + EPOLL_TYPE_TCP_LISTEN, + /* timerfds used for TCP timers */ + EPOLL_TYPE_TCP_TIMER, + /* UDP sockets */ + EPOLL_TYPE_UDP, + /* ICMP/ICMPv6 ping sockets */ + EPOLL_TYPE_PING, + /* inotify fd watching for end of netns (pasta) */ + EPOLL_TYPE_NSQUIT_INOTIFY, + /* timer fd watching for end of netns, fallback for inotify (pasta) */ + EPOLL_TYPE_NSQUIT_TIMER, + /* tuntap character device */ + EPOLL_TYPE_TAP_PASTA, + /* socket connected to qemu */ + EPOLL_TYPE_TAP_PASST, + /* socket listening for qemu socket connections */ + EPOLL_TYPE_TAP_LISTEN, + + EPOLL_NUM_TYPES, +}; + +#endif /* EPOLL_TYPE_H */ diff --git a/icmp.c b/icmp.c index 80330f6f..d4ccc722 100644 --- a/icmp.c +++ b/icmp.c @@ -179,7 +179,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, } ref.flowside = FLOW_SIDX(flow, TGTSIDE); - pingf->sock = sock_l4(c, af, flow_proto[flowtype], bind_addr, bind_if, + pingf->sock = sock_l4(c, af, EPOLL_TYPE_PING, bind_addr, bind_if, 0, ref.data); if (pingf->sock < 0) { diff --git a/passt.h b/passt.h index 21cf4c15..867e77b7 100644 --- a/passt.h +++ b/passt.h @@ -23,38 +23,6 @@ union epoll_ref; #include "tcp.h" #include "udp.h" -/** - * enum epoll_type - Different types of fds we poll over - */ -enum epoll_type { - /* Special value to indicate an invalid type */ - EPOLL_TYPE_NONE = 0, - /* Connected TCP sockets */ - EPOLL_TYPE_TCP, - /* Connected TCP sockets (spliced) */ - EPOLL_TYPE_TCP_SPLICE, - /* Listening TCP sockets */ - EPOLL_TYPE_TCP_LISTEN, - /* timerfds used for TCP timers */ - EPOLL_TYPE_TCP_TIMER, - /* UDP sockets */ - EPOLL_TYPE_UDP, - /* ICMP/ICMPv6 ping sockets */ - EPOLL_TYPE_PING, - /* inotify fd watching for end of netns (pasta) */ - EPOLL_TYPE_NSQUIT_INOTIFY, - /* timer fd watching for end of netns, fallback for inotify (pasta) */ - EPOLL_TYPE_NSQUIT_TIMER, - /* tuntap character device */ - EPOLL_TYPE_TAP_PASTA, - /* socket connected to qemu */ - EPOLL_TYPE_TAP_PASST, - /* socket listening for qemu socket connections */ - EPOLL_TYPE_TAP_LISTEN, - - EPOLL_NUM_TYPES, -}; - /** * union epoll_ref - Breakdown of reference for epoll fd bookkeeping * @type: Type of fd (tells us what to do with events) diff --git a/tcp.c b/tcp.c index 698e7ecb..a490920a 100644 --- a/tcp.c +++ b/tcp.c @@ -2467,7 +2467,7 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, }; int s; - s = sock_l4(c, af, IPPROTO_TCP, addr, ifname, port, tref.u32); + s = sock_l4(c, af, EPOLL_TYPE_TCP_LISTEN, addr, ifname, port, tref.u32); if (c->tcp.fwd_in.mode == FWD_AUTO) { if (af == AF_INET || af == AF_UNSPEC) @@ -2531,8 +2531,8 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET, IPPROTO_TCP, &in4addr_loopback, NULL, port, - tref.u32); + s = sock_l4(c, AF_INET, EPOLL_TYPE_TCP_LISTEN, &in4addr_loopback, + NULL, port, tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else @@ -2557,8 +2557,8 @@ static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET6, IPPROTO_TCP, &in6addr_loopback, NULL, port, - tref.u32); + s = sock_l4(c, AF_INET6, EPOLL_TYPE_TCP_LISTEN, &in6addr_loopback, + NULL, port, tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else diff --git a/udp.c b/udp.c index e089ef95..eadf4872 100644 --- a/udp.c +++ b/udp.c @@ -917,7 +917,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr)) bind_addr = c->ip4.addr_out; - s = sock_l4(c, AF_INET, IPPROTO_UDP, &bind_addr, + s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP, &bind_addr, bind_if, src, uref.u32); if (s < 0) return p->count - idx; @@ -972,7 +972,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, !IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) bind_addr = &c->ip6.addr_out; - s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, + s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP, bind_addr, bind_if, src, uref.u32); if (s < 0) return p->count - idx; @@ -1047,13 +1047,13 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, uref.v6 = 0; if (!ns) { - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, + r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP, addr, ifname, port, uref.u32); udp_tap_map[V4][port].sock = s < 0 ? -1 : s; udp_splice_init[V4][port].sock = s < 0 ? -1 : s; } else { - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, + r4 = s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP, &in4addr_loopback, ifname, port, uref.u32); udp_splice_ns[V4][port].sock = s < 0 ? -1 : s; @@ -1064,13 +1064,13 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, uref.v6 = 1; if (!ns) { - r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, + r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP, addr, ifname, port, uref.u32); udp_tap_map[V6][port].sock = s < 0 ? -1 : s; udp_splice_init[V6][port].sock = s < 0 ? -1 : s; } else { - r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, + r6 = s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP, &in6addr_loopback, ifname, port, uref.u32); udp_splice_ns[V6][port].sock = s < 0 ? -1 : s; diff --git a/util.c b/util.c index dd2e57f6..9a73fbb9 100644 --- a/util.c +++ b/util.c @@ -35,7 +35,7 @@ /** * sock_l4_sa() - Create and bind socket to socket address, add to epoll list * @c: Execution context - * @proto: Protocol number + * @type: epoll type * @sa: Socket address to bind to * @sl: Length of @sa * @ifname: Interface for binding, NULL for any @@ -44,34 +44,38 @@ * * Return: newly created socket, negative error code on failure */ -static int sock_l4_sa(const struct ctx *c, uint8_t proto, +static int sock_l4_sa(const struct ctx *c, enum epoll_type type, const void *sa, socklen_t sl, const char *ifname, bool v6only, uint32_t data) { sa_family_t af = ((const struct sockaddr *)sa)->sa_family; - union epoll_ref ref = { .data = data }; + union epoll_ref ref = { .type = type, .data = data }; struct epoll_event ev; int fd, y = 1, ret; + uint8_t proto; + int socktype; - switch (proto) { - case IPPROTO_TCP: - ref.type = EPOLL_TYPE_TCP_LISTEN; + switch (type) { + case EPOLL_TYPE_TCP_LISTEN: + proto = IPPROTO_TCP; + socktype = SOCK_STREAM | SOCK_NONBLOCK; break; - case IPPROTO_UDP: - ref.type = EPOLL_TYPE_UDP; + case EPOLL_TYPE_UDP: + proto = IPPROTO_UDP; + socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: - ref.type = EPOLL_TYPE_PING; + case EPOLL_TYPE_PING: + if (af == AF_INET) + proto = IPPROTO_ICMP; + else + proto = IPPROTO_ICMPV6; + socktype = SOCK_DGRAM | SOCK_NONBLOCK; break; default: - return -EPFNOSUPPORT; /* Not implemented. */ + ASSERT(0); } - if (proto == IPPROTO_TCP) - fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto); - else - fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto); + fd = socket(af, socktype, proto); ret = -errno; if (fd < 0) { @@ -118,14 +122,14 @@ static int sock_l4_sa(const struct ctx *c, uint8_t proto, * this is fine. This might also fail for ICMP because of a * broken SELinux policy, see icmp_tap_handler(). */ - if (proto != IPPROTO_ICMP && proto != IPPROTO_ICMPV6) { + if (type != EPOLL_TYPE_PING) { ret = -errno; close(fd); return ret; } } - if (proto == IPPROTO_TCP && listen(fd, 128) < 0) { + if (type == EPOLL_TYPE_TCP_LISTEN && listen(fd, 128) < 0) { ret = -errno; warn("TCP socket listen: %s", strerror(-ret)); close(fd); @@ -146,7 +150,7 @@ static int sock_l4_sa(const struct ctx *c, uint8_t proto, * sock_l4() - Create and bind socket for given L4, add to epoll list * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @proto: Protocol number + * @type: epoll type * @bind_addr: Address for binding, NULL for any * @ifname: Interface for binding, NULL for any * @port: Port, host order @@ -154,7 +158,7 @@ static int sock_l4_sa(const struct ctx *c, uint8_t proto, * * Return: newly created socket, negative error code on failure */ -int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, +int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data) { @@ -167,7 +171,7 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, }; if (bind_addr) addr4.sin_addr = *(struct in_addr *)bind_addr; - return sock_l4_sa(c, proto, &addr4, sizeof(addr4), ifname, + return sock_l4_sa(c, type, &addr4, sizeof(addr4), ifname, false, data); } @@ -188,7 +192,7 @@ int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, sizeof(c->ip6.addr_ll))) addr6.sin6_scope_id = c->ifi6; } - return sock_l4_sa(c, proto, &addr6, sizeof(addr6), ifname, + return sock_l4_sa(c, type, &addr6, sizeof(addr6), ifname, af == AF_INET6, data); } default: diff --git a/util.h b/util.h index eebb027b..d0150396 100644 --- a/util.h +++ b/util.h @@ -137,13 +137,14 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #include <limits.h> #include <stdint.h> +#include "epoll_type.h" #include "packet.h" struct ctx; /* cppcheck-suppress funcArgNamesDifferent */ __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } -int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, +int sock_l4(const struct ctx *c, sa_family_t af, enum epoll_type type, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data); void sock_probe_mem(struct ctx *c); -- 2.45.2
To implement the TCP hash table, we need an invalid (NULL-like) value for flow_sidx_t. We use FLOW_SIDX_NONE for that, but for defensiveness, we treat (usually) anything with an out of bounds flow index the same way. That's not always done consistently though. In flow_at_sidx() we open code a check on the flow index. In tcp_hash_probe() we instead compare against FLOW_SIDX_NONE, and in some other places we use the fact that flow_at_sidx() will return NULL in this case, even if we don't otherwise need the flow it returns. Clean this up a bit, by adding an explicit flow_sidx_valid() test function. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 11 +++++++++++ flow_table.h | 2 +- tcp.c | 7 +++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/flow.h b/flow.h index 29ef9f12..d1f49c65 100644 --- a/flow.h +++ b/flow.h @@ -176,6 +176,17 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), #define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX }) +/** + * flow_sidx_valid() - Test if a sidx is valid + * @sidx: sidx value + * + * Return: true if @sidx refers to a valid flow & side + */ +static inline bool flow_sidx_valid(flow_sidx_t sidx) +{ + return sidx.flow < FLOW_MAX; +} + /** * flow_sidx_eq() - Test if two sidx values are equal * @a, @b: sidx values diff --git a/flow_table.h b/flow_table.h index 1b163491..226ddbdd 100644 --- a/flow_table.h +++ b/flow_table.h @@ -73,7 +73,7 @@ static inline unsigned flow_idx(const struct flow_common *f) */ static inline union flow *flow_at_sidx(flow_sidx_t sidx) { - if (sidx.flow >= FLOW_MAX) + if (!flow_sidx_valid(sidx)) return NULL; return FLOW(sidx.flow); } diff --git a/tcp.c b/tcp.c index a490920a..75b959a2 100644 --- a/tcp.c +++ b/tcp.c @@ -880,8 +880,7 @@ static inline unsigned tcp_hash_probe(const struct ctx *c, flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE(conn)); /* Linear probing */ - while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && - !flow_sidx_eq(tc_hash[b], sidx)) + while (flow_sidx_valid(tc_hash[b]) && !flow_sidx_eq(tc_hash[b], sidx)) b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); return b; @@ -909,9 +908,9 @@ static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn), s; - union flow *flow = flow_at_sidx(tc_hash[b]); + union flow *flow; - if (!flow) + if (!flow_sidx_valid(tc_hash[b])) return; /* Redundant remove */ flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); -- 2.45.2
udp_buf_sock_handler() takes the epoll reference from the receiving socket, and passes the UDP relevant part on to several other functions. Future changes are going to need several different epoll types for UDP, and to pass that information through to some of those functions. To avoid extra noise in the patches making the real changes, change those functions now to take the full epoll reference, rather than just the UDP part. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 63 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/udp.c b/udp.c index eadf4872..d38e0b7d 100644 --- a/udp.c +++ b/udp.c @@ -477,25 +477,26 @@ static int udp_splice_new_ns(void *arg) /** * udp_mmh_splice_port() - Is source address of message suitable for splicing? - * @uref: UDP epoll reference for incoming message's origin socket + * @ref: epoll reference for incoming message's origin socket * @mmh: mmsghdr of incoming message * * Return: if source address of message in @mmh refers to localhost (127.0.0.1 * or ::1) its source port (host order), otherwise -1. */ -static int udp_mmh_splice_port(union udp_epoll_ref uref, - const struct mmsghdr *mmh) +static int udp_mmh_splice_port(union epoll_ref ref, const struct mmsghdr *mmh) { const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name; const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name; - if (!uref.splice) + ASSERT(ref.type == EPOLL_TYPE_UDP); + + if (!ref.udp.splice) return -1; - if (uref.v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) + if (ref.udp.v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) return ntohs(sa6->sin6_port); - if (!uref.v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) + if (!ref.udp.v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) return ntohs(sa4->sin_port); return -1; @@ -507,7 +508,7 @@ static int udp_mmh_splice_port(union udp_epoll_ref uref, * @start: Index of first datagram in udp[46]_l2_buf * @n: Total number of datagrams in udp[46]_l2_buf pool * @dst: Datagrams will be sent to this port (on destination side) - * @uref: UDP epoll reference for origin socket + * @ref: epoll reference for origin socket * @now: Timestamp * * This consumes as many datagrams as are sendable via a single socket. It @@ -518,7 +519,7 @@ static int udp_mmh_splice_port(union udp_epoll_ref uref, * Return: Number of datagrams forwarded */ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, - in_port_t dst, union udp_epoll_ref uref, + in_port_t dst, union epoll_ref ref, const struct timespec *now) { in_port_t src = udp_meta[start].splicesrc; @@ -527,8 +528,9 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, int s; ASSERT(udp_meta[start].splicesrc >= 0); + ASSERT(ref.type == EPOLL_TYPE_UDP); - if (uref.v6) { + if (ref.udp.v6) { mmh_recv = udp6_l2_mh_sock; mmh_send = udp6_mh_splice; udp6_localname.sin6_port = htons(dst); @@ -544,27 +546,27 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, if (++i >= n) break; - udp_meta[i].splicesrc = udp_mmh_splice_port(uref, &mmh_recv[i]); + udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); } while (udp_meta[i].splicesrc == src); - if (uref.pif == PIF_SPLICE) { + if (ref.udp.pif == PIF_SPLICE) { src += c->udp.fwd_in.rdelta[src]; - s = udp_splice_init[uref.v6][src].sock; - if (s < 0 && uref.orig) - s = udp_splice_new(c, uref.v6, src, false); + s = udp_splice_init[ref.udp.v6][src].sock; + if (s < 0 && ref.udp.orig) + s = udp_splice_new(c, ref.udp.v6, src, false); if (s < 0) goto out; - udp_splice_ns[uref.v6][dst].ts = now->tv_sec; - udp_splice_init[uref.v6][src].ts = now->tv_sec; + udp_splice_ns[ref.udp.v6][dst].ts = now->tv_sec; + udp_splice_init[ref.udp.v6][src].ts = now->tv_sec; } else { - ASSERT(uref.pif == PIF_HOST); + ASSERT(ref.udp.pif == PIF_HOST); src += c->udp.fwd_out.rdelta[src]; - s = udp_splice_ns[uref.v6][src].sock; - if (s < 0 && uref.orig) { + s = udp_splice_ns[ref.udp.v6][src].sock; + if (s < 0 && ref.udp.orig) { struct udp_splice_new_ns_arg arg = { - c, uref.v6, src, -1, + c, ref.udp.v6, src, -1, }; NS_CALL(udp_splice_new_ns, &arg); @@ -573,8 +575,8 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, if (s < 0) goto out; - udp_splice_init[uref.v6][dst].ts = now->tv_sec; - udp_splice_ns[uref.v6][src].ts = now->tv_sec; + udp_splice_init[ref.udp.v6][dst].ts = now->tv_sec; + udp_splice_ns[ref.udp.v6][src].ts = now->tv_sec; } sendmmsg(s, mmh_send + start, i - start, MSG_NOSIGNAL); @@ -716,7 +718,7 @@ static size_t udp_update_hdr6(const struct ctx *c, * @start: Index of first datagram in udp[46]_l2_buf pool * @n: Total number of datagrams in udp[46]_l2_buf pool * @dstport: Destination port number on destination side - * @uref: UDP epoll reference for origin socket + * @ref: epoll reference for origin socket * @now: Current timestamp * * This consumes as many frames as are sendable via tap. It requires that @@ -726,7 +728,7 @@ static size_t udp_update_hdr6(const struct ctx *c, * Return: Number of frames sent via tap */ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, - in_port_t dstport, union udp_epoll_ref uref, + in_port_t dstport, union epoll_ref ref, const struct timespec *now) { struct iovec (*tap_iov)[UDP_NUM_IOVS]; @@ -734,8 +736,9 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, size_t i = start; ASSERT(udp_meta[start].splicesrc == -1); + ASSERT(ref.type == EPOLL_TYPE_UDP); - if (uref.v6) { + if (ref.udp.v6) { tap_iov = udp6_l2_iov_tap; mmh_recv = udp6_l2_mh_sock; } else { @@ -748,7 +751,7 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, struct udp_meta_t *bm = &udp_meta[i]; size_t l4len; - if (uref.v6) { + if (ref.udp.v6) { l4len = udp_update_hdr6(c, &bm->ip6h, &bm->s_in.sa6, bp, dstport, udp6_l2_mh_sock[i].msg_len, now); @@ -766,7 +769,7 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, if (++i >= n) break; - udp_meta[i].splicesrc = udp_mmh_splice_port(uref, &mmh_recv[i]); + udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); } while (udp_meta[i].splicesrc == -1); tap_send_frames(c, &tap_iov[start][0], UDP_NUM_IOVS, i - start); @@ -823,12 +826,12 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve * present). So we fill in entry 0 before the loop, then udp_*_send() * populate one entry past where they consume. */ - udp_meta[0].splicesrc = udp_mmh_splice_port(ref.udp, mmh_recv); + udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv); for (i = 0; i < n; i += m) { if (udp_meta[i].splicesrc >= 0) - m = udp_splice_send(c, i, n, dstport, ref.udp, now); + m = udp_splice_send(c, i, n, dstport, ref, now); else - m = udp_tap_send(c, i, n, dstport, ref.udp, now); + m = udp_tap_send(c, i, n, dstport, ref, now); } } -- 2.45.2
Make the salient points about these various arrays clearer with renames: * udp_l2_iov_sock and udp[46]_l2_mh_sock don't really have anything to do with L2. They are, however, specific to receiving not sending. Rename to udp_iov_recv and udp[46]_mh_recv. * udp[46]_l2_iov_tap is redundant - "tap" implies L2 and vice versa. Rename to udp[46]_l2_iov * udp[46]_localname are (for now) pre-populated with the local address but the more salient point is that these are the destination address for the splice arrays. Rename to udp[46]_splice_to Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 68 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/udp.c b/udp.c index d38e0b7d..43ed6435 100644 --- a/udp.c +++ b/udp.c @@ -229,30 +229,30 @@ enum udp_iov_idx { UDP_NUM_IOVS }; -/* recvmmsg()/sendmmsg() data for tap */ -static struct iovec udp_l2_iov_sock [UDP_MAX_FRAMES]; +/* IOVs and msghdr arrays for receiving datagrams from sockets */ +static struct iovec udp_iov_recv [UDP_MAX_FRAMES]; +static struct mmsghdr udp4_mh_recv [UDP_MAX_FRAMES]; +static struct mmsghdr udp6_mh_recv [UDP_MAX_FRAMES]; -static struct iovec udp4_l2_iov_tap [UDP_MAX_FRAMES][UDP_NUM_IOVS]; -static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES][UDP_NUM_IOVS]; - -static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; - -/* recvmmsg()/sendmmsg() data for "spliced" connections */ -static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; - -static struct sockaddr_in udp4_localname = { +/* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ +static struct sockaddr_in udp4_splice_to = { .sin_family = AF_INET, .sin_addr = IN4ADDR_LOOPBACK_INIT, }; -static struct sockaddr_in6 udp6_localname = { +static struct sockaddr_in6 udp6_splice_to = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; +static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +/* IOVs for L2 frames */ +static struct iovec udp4_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; +static struct iovec udp6_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; + + /** * udp_portmap_clear() - Clear UDP port map before configuration */ @@ -313,7 +313,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void udp_iov_init_one(const struct ctx *c, size_t i) { struct udp_payload_t *payload = &udp_payload[i]; - struct iovec *siov = &udp_l2_iov_sock[i]; + struct iovec *siov = &udp_iov_recv[i]; struct udp_meta_t *meta = &udp_meta[i]; *meta = (struct udp_meta_t) { @@ -326,8 +326,8 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); if (c->ifi4) { - struct msghdr *mh = &udp4_l2_mh_sock[i].msg_hdr; - struct iovec *tiov = udp4_l2_iov_tap[i]; + struct msghdr *mh = &udp4_mh_recv[i].msg_hdr; + struct iovec *tiov = udp4_l2_iov[i]; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in); @@ -341,8 +341,8 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) } if (c->ifi6) { - struct msghdr *mh = &udp6_l2_mh_sock[i].msg_hdr; - struct iovec *tiov = udp6_l2_iov_tap[i]; + struct msghdr *mh = &udp6_mh_recv[i].msg_hdr; + struct iovec *tiov = udp6_l2_iov[i]; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in6); @@ -531,13 +531,13 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, ASSERT(ref.type == EPOLL_TYPE_UDP); if (ref.udp.v6) { - mmh_recv = udp6_l2_mh_sock; + mmh_recv = udp6_mh_recv; mmh_send = udp6_mh_splice; - udp6_localname.sin6_port = htons(dst); + udp6_splice_to.sin6_port = htons(dst); } else { - mmh_recv = udp4_l2_mh_sock; + mmh_recv = udp4_mh_recv; mmh_send = udp4_mh_splice; - udp4_localname.sin_port = htons(dst); + udp4_splice_to.sin_port = htons(dst); } do { @@ -739,11 +739,11 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, ASSERT(ref.type == EPOLL_TYPE_UDP); if (ref.udp.v6) { - tap_iov = udp6_l2_iov_tap; - mmh_recv = udp6_l2_mh_sock; + tap_iov = udp6_l2_iov; + mmh_recv = udp6_mh_recv; } else { - mmh_recv = udp4_l2_mh_sock; - tap_iov = udp4_l2_iov_tap; + mmh_recv = udp4_mh_recv; + tap_iov = udp4_l2_iov; } do { @@ -754,13 +754,13 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, if (ref.udp.v6) { l4len = udp_update_hdr6(c, &bm->ip6h, &bm->s_in.sa6, bp, dstport, - udp6_l2_mh_sock[i].msg_len, now); + udp6_mh_recv[i].msg_len, now); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); } else { l4len = udp_update_hdr4(c, &bm->ip4h, &bm->s_in.sa4, bp, dstport, - udp4_l2_mh_sock[i].msg_len, now); + udp4_mh_recv[i].msg_len, now); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); } @@ -811,9 +811,9 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve dstport += c->udp.fwd_in.f.delta[dstport]; if (v6) - mmh_recv = udp6_l2_mh_sock; + mmh_recv = udp6_mh_recv; else - mmh_recv = udp4_l2_mh_sock; + mmh_recv = udp4_mh_recv; n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL); if (n <= 0) @@ -1097,11 +1097,11 @@ static void udp_splice_iov_init(void) struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr; struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr; - mh4->msg_name = &udp4_localname; - mh4->msg_namelen = sizeof(udp4_localname); + mh4->msg_name = &udp4_splice_to; + mh4->msg_namelen = sizeof(udp4_splice_to); - mh6->msg_name = &udp6_localname; - mh6->msg_namelen = sizeof(udp6_localname); + mh6->msg_name = &udp6_splice_to; + mh6->msg_namelen = sizeof(udp6_splice_to); udp_iov_splice[i].iov_base = udp_payload[i].data; -- 2.45.2
We have separate mmsghdr arrays for splicing IPv4 and IPv6 packets, where the only difference is that they point to different sockaddr buffers for the destination address. Unify these by having the common array point at a sockaddr_inany as the address. This does mean slightly more work when we're about to splice, because we need to write the whole socket address, rather than just the port. However it removes 32 mmsghdr structures and we're going to need more flexibility constructing that target address for the flow table. Because future changes might mean that the address isn't always loopback, change the name of the common address from *_localname to udp_splicename. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/udp.c b/udp.c index 43ed6435..2d34f6ac 100644 --- a/udp.c +++ b/udp.c @@ -235,18 +235,10 @@ static struct mmsghdr udp4_mh_recv [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_recv [UDP_MAX_FRAMES]; /* IOVs and msghdr arrays for sending "spliced" datagrams to sockets */ -static struct sockaddr_in udp4_splice_to = { - .sin_family = AF_INET, - .sin_addr = IN4ADDR_LOOPBACK_INIT, -}; -static struct sockaddr_in6 udp6_splice_to = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, -}; +static union sockaddr_inany udp_splice_to; static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; +static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; /* IOVs for L2 frames */ static struct iovec udp4_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; @@ -523,7 +515,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, const struct timespec *now) { in_port_t src = udp_meta[start].splicesrc; - struct mmsghdr *mmh_recv, *mmh_send; + struct mmsghdr *mmh_recv; unsigned int i = start; int s; @@ -532,16 +524,22 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, if (ref.udp.v6) { mmh_recv = udp6_mh_recv; - mmh_send = udp6_mh_splice; - udp6_splice_to.sin6_port = htons(dst); + udp_splice_to.sa6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = in6addr_loopback, + .sin6_port = htons(dst), + }; } else { mmh_recv = udp4_mh_recv; - mmh_send = udp4_mh_splice; - udp4_splice_to.sin_port = htons(dst); + udp_splice_to.sa4 = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr = in4addr_loopback, + .sin_port = htons(dst), + }; } do { - mmh_send[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len; + udp_mh_splice[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len; if (++i >= n) break; @@ -579,7 +577,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, udp_splice_ns[ref.udp.v6][src].ts = now->tv_sec; } - sendmmsg(s, mmh_send + start, i - start, MSG_NOSIGNAL); + sendmmsg(s, udp_mh_splice + start, i - start, MSG_NOSIGNAL); out: return i - start; } @@ -1094,20 +1092,15 @@ static void udp_splice_iov_init(void) int i; for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr; - struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr; - - mh4->msg_name = &udp4_splice_to; - mh4->msg_namelen = sizeof(udp4_splice_to); + struct msghdr *mh = &udp_mh_splice[i].msg_hdr; - mh6->msg_name = &udp6_splice_to; - mh6->msg_namelen = sizeof(udp6_splice_to); + mh->msg_name = &udp_splice_to; + mh->msg_namelen = sizeof(udp_splice_to); udp_iov_splice[i].iov_base = udp_payload[i].data; - mh4->msg_iov = &udp_iov_splice[i]; - mh6->msg_iov = &udp_iov_splice[i]; - mh4->msg_iovlen = mh6->msg_iovlen = 1; + mh->msg_iov = &udp_iov_splice[i]; + mh->msg_iovlen = 1; } } -- 2.45.2
The only differences between these arrays are that udp4_l2_iov is pre-initialised to point to the IPv4 ethernet header, and IPv4 per-frame header and udp6_l2_iov points to the IPv6 versions. We already have to set up a bunch of headers per-frame, including updating udp[46]_l2_iov[i][UDP_IOV_PAYLOAD].iov_len. It makes more sense to adjust the IOV entries to point at the correct headers for the frame than to have two complete sets of iovecs. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/udp.c b/udp.c index 2d34f6ac..8729dea2 100644 --- a/udp.c +++ b/udp.c @@ -241,8 +241,7 @@ static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES]; /* IOVs for L2 frames */ -static struct iovec udp4_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; -static struct iovec udp6_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; +static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS]; /** @@ -305,8 +304,9 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void udp_iov_init_one(const struct ctx *c, size_t i) { struct udp_payload_t *payload = &udp_payload[i]; - struct iovec *siov = &udp_iov_recv[i]; struct udp_meta_t *meta = &udp_meta[i]; + struct iovec *siov = &udp_iov_recv[i]; + struct iovec *tiov = udp_l2_iov[i]; *meta = (struct udp_meta_t) { .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP), @@ -317,34 +317,29 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP); udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); + tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); + tiov[UDP_IOV_PAYLOAD].iov_base = payload; + + /* It's useful to have separate msghdr arrays for receiving. Otherwise, + * an IPv4 recv() will alter msg_namelen, so we'd have to reset it every + * time or risk truncating the address on future IPv6 recv()s. + */ if (c->ifi4) { struct msghdr *mh = &udp4_mh_recv[i].msg_hdr; - struct iovec *tiov = udp4_l2_iov[i]; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in); mh->msg_iov = siov; mh->msg_iovlen = 1; - - tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); - tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); - tiov[UDP_IOV_IP] = IOV_OF_LVALUE(meta->ip4h); - tiov[UDP_IOV_PAYLOAD].iov_base = payload; } if (c->ifi6) { struct msghdr *mh = &udp6_mh_recv[i].msg_hdr; - struct iovec *tiov = udp6_l2_iov[i]; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in6); mh->msg_iov = siov; mh->msg_iovlen = 1; - - tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); - tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); - tiov[UDP_IOV_IP] = IOV_OF_LVALUE(meta->ip6h); - tiov[UDP_IOV_PAYLOAD].iov_base = payload; } } @@ -729,22 +724,19 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, in_port_t dstport, union epoll_ref ref, const struct timespec *now) { - struct iovec (*tap_iov)[UDP_NUM_IOVS]; struct mmsghdr *mmh_recv; size_t i = start; ASSERT(udp_meta[start].splicesrc == -1); ASSERT(ref.type == EPOLL_TYPE_UDP); - if (ref.udp.v6) { - tap_iov = udp6_l2_iov; + if (ref.udp.v6) mmh_recv = udp6_mh_recv; - } else { + else mmh_recv = udp4_mh_recv; - tap_iov = udp4_l2_iov; - } do { + struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[i]; struct udp_payload_t *bp = &udp_payload[i]; struct udp_meta_t *bm = &udp_meta[i]; size_t l4len; @@ -755,14 +747,18 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, udp6_mh_recv[i].msg_len, now); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); + (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { l4len = udp_update_hdr4(c, &bm->ip4h, &bm->s_in.sa4, bp, dstport, udp4_mh_recv[i].msg_len, now); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); + (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); } - tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len; + (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; if (++i >= n) break; @@ -770,7 +766,7 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); } while (udp_meta[i].splicesrc == -1); - tap_send_frames(c, &tap_iov[start][0], UDP_NUM_IOVS, i - start); + tap_send_frames(c, &udp_l2_iov[start][0], UDP_NUM_IOVS, i - start); return i - start; } -- 2.45.2
Since we split our packet frame buffers into different pieces, we have a single buffer per IP version for the ethernet header, rather than one per frame. This makes sense since our ethernet header is alwaus the same. However we initialise those buffers udp[46]_eth_hdr inside a per frame loop. Pull that outside the loop so we just initialise them once. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udp.c b/udp.c index 8729dea2..2d403378 100644 --- a/udp.c +++ b/udp.c @@ -314,8 +314,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i) }; *siov = IOV_OF_LVALUE(payload->data); - udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP); - udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_PAYLOAD].iov_base = payload; @@ -351,6 +349,9 @@ static void udp_iov_init(const struct ctx *c) { size_t i; + udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP); + udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); + for (i = 0; i < UDP_MAX_FRAMES; i++) udp_iov_init_one(c, i); } -- 2.45.2
udp_buf_sock_handler(), udp_splice_send() and udp_tap_send loosely, do four things between them: 1. Receive some datagrams from a socket 2. Split those datagrams into batches depending on how they need to be sent (via tap or via a specific splice socket) 3. Prepare buffers for each datagram to send it onwards 4. Actually send it onwards Split (1) and (3) into specific helper functions. This isn't immediately useful (udp_splice_prepare(), in particular, is trivial), but it will make further reworks clearer. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 130 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 46 deletions(-) diff --git a/udp.c b/udp.c index 2d403378..af5f23f0 100644 --- a/udp.c +++ b/udp.c @@ -490,6 +490,16 @@ static int udp_mmh_splice_port(union epoll_ref ref, const struct mmsghdr *mmh) return -1; } +/** + * udp_splice_prepare() - Prepare one datagram for splicing + * @mmh: Receiving mmsghdr array + * @idx: Index of the datagram to prepare + */ +static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) +{ + udp_mh_splice[idx].msg_hdr.msg_iov->iov_len = mmh[idx].msg_len; +} + /** * udp_splice_send() - Send datagrams from socket to socket * @c: Execution context @@ -535,7 +545,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, } do { - udp_mh_splice[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len; + udp_splice_prepare(mmh_recv, i); if (++i >= n) break; @@ -706,6 +716,42 @@ static size_t udp_update_hdr6(const struct ctx *c, return l4len; } +/** + * udp_tap_prepare() - Convert one datagram into a tap frame + * @c: Execution context + * @mmh: Receiving mmsghdr array + * @idx: Index of the datagram to prepare + * @dstport: Destination port + * @v6: Prepare for IPv6? + * @now: Current timestamp + */ +static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, + unsigned idx, in_port_t dstport, bool v6, + const struct timespec *now) +{ + struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; + struct udp_payload_t *bp = &udp_payload[idx]; + struct udp_meta_t *bm = &udp_meta[idx]; + size_t l4len; + + if (v6) { + l4len = udp_update_hdr6(c, &bm->ip6h, &bm->s_in.sa6, bp, + dstport, mmh[idx].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + + sizeof(udp6_eth_hdr)); + (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); + } else { + l4len = udp_update_hdr4(c, &bm->ip4h, &bm->s_in.sa4, bp, + dstport, mmh[idx].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + + sizeof(udp4_eth_hdr)); + (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); + (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); + } + (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; +} + /** * udp_tap_send() - Prepare UDP datagrams and send to tap interface * @c: Execution context @@ -737,29 +783,7 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, mmh_recv = udp4_mh_recv; do { - struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[i]; - struct udp_payload_t *bp = &udp_payload[i]; - struct udp_meta_t *bm = &udp_meta[i]; - size_t l4len; - - if (ref.udp.v6) { - l4len = udp_update_hdr6(c, &bm->ip6h, - &bm->s_in.sa6, bp, dstport, - udp6_mh_recv[i].msg_len, now); - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + - sizeof(udp6_eth_hdr)); - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); - (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); - } else { - l4len = udp_update_hdr4(c, &bm->ip4h, - &bm->s_in.sa4, bp, dstport, - udp4_mh_recv[i].msg_len, now); - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + - sizeof(udp4_eth_hdr)); - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); - (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h); - } - (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; + udp_tap_prepare(c, mmh_recv, i, dstport, ref.udp.v6, now); if (++i >= n) break; @@ -771,6 +795,39 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, return i - start; } +/** + * udp_sock_recv() - Receive datagrams from a socket + * @c: Execution context + * @s: Socket to receive from + * @events: epoll events bitmap + * @mmh mmsghdr array to receive into + * + * #syscalls recvmmsg + */ +int udp_sock_recv(const struct ctx *c, int s, uint32_t events, + struct mmsghdr *mmh) +{ + /* For not entirely clear reasons (data locality?) pasta gets better + * throughput if we receive tap datagrams one at a atime. For small + * splice datagrams throughput is slightly better if we do batch, but + * it's slightly worse for large splice datagrams. Since we don't know + * before we receive whether we'll use tap or splice, always go one at a + * time for pasta mode. + */ + int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); + + if (c->no_udp || !(events & EPOLLIN)) + return 0; + + n = recvmmsg(s, mmh, n, 0, NULL); + if (n < 0) { + err_perror("Error receiving datagrams"); + return 0; + } + + return n; +} + /** * udp_buf_sock_handler() - Handle new data from socket * @c: Execution context @@ -783,21 +840,11 @@ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - /* For not entirely clear reasons (data locality?) pasta gets - * better throughput if we receive tap datagrams one at a - * atime. For small splice datagrams throughput is slightly - * better if we do batch, but it's slightly worse for large - * splice datagrams. Since we don't know before we receive - * whether we'll use tap or splice, always go one at a time - * for pasta mode. - */ - ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); + struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv; in_port_t dstport = ref.udp.port; - bool v6 = ref.udp.v6; - struct mmsghdr *mmh_recv; - int i, m; + int n, m, i; - if (c->no_udp || !(events & EPOLLIN)) + if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) return; if (ref.udp.pif == PIF_SPLICE) @@ -805,15 +852,6 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve else if (ref.udp.pif == PIF_HOST) dstport += c->udp.fwd_in.f.delta[dstport]; - if (v6) - mmh_recv = udp6_mh_recv; - else - mmh_recv = udp4_mh_recv; - - n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL); - if (n <= 0) - return; - /* We divide things into batches based on how we need to send them, * determined by udp_meta[i].splicesrc. To avoid either two passes * through the array, or recalculating splicesrc for a single entry, we -- 2.45.2
When we receive datagrams on a socket, we need to split them into batches depending on how they need to be forwarded (either via a specific splice socket, or via tap). The logic to do this, is somewhat awkwardly split between udp_buf_sock_handler() itself, udp_splice_send() and udp_tap_send(). Move all the batching logic into udp_buf_sock_handler(), leaving udp_splice_send() to just send the prepared batch. udp_tap_send() reduces to just a call to tap_send_frames() so open-code that call in udp_buf_sock_handler(). This will allow separating the batching logic from the rest of the datagram forwarding logic, which we'll need for upcoming flow table support. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 132 +++++++++++++++++++--------------------------------------- 1 file changed, 42 insertions(+), 90 deletions(-) diff --git a/udp.c b/udp.c index af5f23f0..dee402f7 100644 --- a/udp.c +++ b/udp.c @@ -501,42 +501,29 @@ static void udp_splice_prepare(struct mmsghdr *mmh, unsigned idx) } /** - * udp_splice_send() - Send datagrams from socket to socket + * udp_splice_send() - Send a batch of datagrams from socket to socket * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf - * @n: Total number of datagrams in udp[46]_l2_buf pool - * @dst: Datagrams will be sent to this port (on destination side) + * @start: Index of batch's first datagram in udp[46]_l2_buf + * @n: Number of datagrams in batch + * @src: Source port for datagram (target side) + * @dst: Destination port for datagrams (target side) * @ref: epoll reference for origin socket * @now: Timestamp - * - * This consumes as many datagrams as are sendable via a single socket. It - * requires that udp_meta[(a)start].splicesrc is initialised, and will initialise - * udp_meta[].splicesrc for each datagram it consumes *and one more* (if - * present). - * - * Return: Number of datagrams forwarded */ -static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, - in_port_t dst, union epoll_ref ref, - const struct timespec *now) +static void udp_splice_send(const struct ctx *c, size_t start, size_t n, + in_port_t src, in_port_t dst, + union epoll_ref ref, + const struct timespec *now) { - in_port_t src = udp_meta[start].splicesrc; - struct mmsghdr *mmh_recv; - unsigned int i = start; int s; - ASSERT(udp_meta[start].splicesrc >= 0); - ASSERT(ref.type == EPOLL_TYPE_UDP); - if (ref.udp.v6) { - mmh_recv = udp6_mh_recv; udp_splice_to.sa6 = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = in6addr_loopback, .sin6_port = htons(dst), }; } else { - mmh_recv = udp4_mh_recv; udp_splice_to.sa4 = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr = in4addr_loopback, @@ -544,15 +531,6 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, }; } - do { - udp_splice_prepare(mmh_recv, i); - - if (++i >= n) - break; - - udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == src); - if (ref.udp.pif == PIF_SPLICE) { src += c->udp.fwd_in.rdelta[src]; s = udp_splice_init[ref.udp.v6][src].sock; @@ -560,7 +538,7 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, s = udp_splice_new(c, ref.udp.v6, src, false); if (s < 0) - goto out; + return; udp_splice_ns[ref.udp.v6][dst].ts = now->tv_sec; udp_splice_init[ref.udp.v6][src].ts = now->tv_sec; @@ -577,15 +555,13 @@ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, s = arg.s; } if (s < 0) - goto out; + return; udp_splice_init[ref.udp.v6][dst].ts = now->tv_sec; udp_splice_ns[ref.udp.v6][src].ts = now->tv_sec; } - sendmmsg(s, udp_mh_splice + start, i - start, MSG_NOSIGNAL); -out: - return i - start; + sendmmsg(s, udp_mh_splice + start, n, MSG_NOSIGNAL); } /** @@ -725,7 +701,7 @@ static size_t udp_update_hdr6(const struct ctx *c, * @v6: Prepare for IPv6? * @now: Current timestamp */ -static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, +static void udp_tap_prepare(const struct ctx *c, const struct mmsghdr *mmh, unsigned idx, in_port_t dstport, bool v6, const struct timespec *now) { @@ -752,49 +728,6 @@ static void udp_tap_prepare(const struct ctx *c, struct mmsghdr *mmh, (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len; } -/** - * udp_tap_send() - Prepare UDP datagrams and send to tap interface - * @c: Execution context - * @start: Index of first datagram in udp[46]_l2_buf pool - * @n: Total number of datagrams in udp[46]_l2_buf pool - * @dstport: Destination port number on destination side - * @ref: epoll reference for origin socket - * @now: Current timestamp - * - * This consumes as many frames as are sendable via tap. It requires that - * udp_meta[(a)start].splicesrc is initialised, and will initialise - * udp_meta[].splicesrc for each frame it consumes *and one more* (if present). - * - * Return: Number of frames sent via tap - */ -static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, - in_port_t dstport, union epoll_ref ref, - const struct timespec *now) -{ - struct mmsghdr *mmh_recv; - size_t i = start; - - ASSERT(udp_meta[start].splicesrc == -1); - ASSERT(ref.type == EPOLL_TYPE_UDP); - - if (ref.udp.v6) - mmh_recv = udp6_mh_recv; - else - mmh_recv = udp4_mh_recv; - - do { - udp_tap_prepare(c, mmh_recv, i, dstport, ref.udp.v6, now); - - if (++i >= n) - break; - - udp_meta[i].splicesrc = udp_mmh_splice_port(ref, &mmh_recv[i]); - } while (udp_meta[i].splicesrc == -1); - - tap_send_frames(c, &udp_l2_iov[start][0], UDP_NUM_IOVS, i - start); - return i - start; -} - /** * udp_sock_recv() - Receive datagrams from a socket * @c: Execution context @@ -842,7 +775,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve { struct mmsghdr *mmh_recv = ref.udp.v6 ? udp6_mh_recv : udp4_mh_recv; in_port_t dstport = ref.udp.port; - int n, m, i; + int n, i; if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) return; @@ -852,19 +785,38 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve else if (ref.udp.pif == PIF_HOST) dstport += c->udp.fwd_in.f.delta[dstport]; - /* We divide things into batches based on how we need to send them, + /* We divide datagrams into batches based on how we need to send them, * determined by udp_meta[i].splicesrc. To avoid either two passes * through the array, or recalculating splicesrc for a single entry, we - * have to populate it one entry *ahead* of the loop counter (if - * present). So we fill in entry 0 before the loop, then udp_*_send() - * populate one entry past where they consume. + * have to populate it one entry *ahead* of the loop counter. */ udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv); - for (i = 0; i < n; i += m) { - if (udp_meta[i].splicesrc >= 0) - m = udp_splice_send(c, i, n, dstport, ref, now); - else - m = udp_tap_send(c, i, n, dstport, ref, now); + for (i = 0; i < n; ) { + int batchsrc = udp_meta[i].splicesrc; + int batchstart = i; + + do { + if (batchsrc >= 0) { + udp_splice_prepare(mmh_recv, i); + } else { + udp_tap_prepare(c, mmh_recv, i, dstport, + ref.udp.v6, now); + } + + if (++i >= n) + break; + + udp_meta[i].splicesrc = udp_mmh_splice_port(ref, + &mmh_recv[i]); + } while (udp_meta[i].splicesrc == batchsrc); + + if (batchsrc >= 0) { + udp_splice_send(c, batchstart, i - batchstart, + batchsrc, dstport, ref, now); + } else { + tap_send_frames(c, &udp_l2_iov[batchstart][0], + UDP_NUM_IOVS, i - batchstart); + } } } -- 2.45.2
For the approach we intend to use for handling UDP flows, we have some pretty specific requirements about how SO_REUSEADDR works with UDP sockets. Specifically SO_REUSEADDR allows multiple sockets with overlapping bind()s, and therefore there can be multiple sockets which are eligible to receive the same datagram. Which one will actually receive it is important to us. Add a test program which verifies things work the way we expect, which documents what those expectations are in the process. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- doc/platform-requirements/.gitignore | 1 + doc/platform-requirements/Makefile | 45 ++++ doc/platform-requirements/README | 18 ++ doc/platform-requirements/common.c | 66 +++++ doc/platform-requirements/common.h | 47 ++++ .../reuseaddr-priority.c | 240 ++++++++++++++++++ 6 files changed, 417 insertions(+) create mode 100644 doc/platform-requirements/.gitignore create mode 100644 doc/platform-requirements/Makefile create mode 100644 doc/platform-requirements/README create mode 100644 doc/platform-requirements/common.c create mode 100644 doc/platform-requirements/common.h create mode 100644 doc/platform-requirements/reuseaddr-priority.c diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore new file mode 100644 index 00000000..c1baa98e --- /dev/null +++ b/doc/platform-requirements/.gitignore @@ -0,0 +1 @@ +/reuseaddr-priority diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile new file mode 100644 index 00000000..6e1d966c --- /dev/null +++ b/doc/platform-requirements/Makefile @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright Red Hat +# Author: David Gibson <david(a)gibson.dropbear.id.au> + +TARGETS = reuseaddr-priority +SRCS = reuseaddr-priority.c +CFLAGS = -Wall + +all: cppcheck clang-tidy $(TARGETS:%=check-%) + +$(TARGETS): %: %.c common.c common.h + +check-%: % + ./$< + +cppcheck: + cppcheck --std=c11 --error-exitcode=1 --enable=all --force \ + --check-level=exhaustive \ + --inconclusive --library=posix --quiet \ + --suppress=missingIncludeSystem \ + $(SRCS) + +clang-tidy: + clang-tidy --checks=*,\ + -altera-id-dependent-backward-branch,\ + -altera-unroll-loops,\ + -bugprone-easily-swappable-parameters,\ + -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,\ + -concurrency-mt-unsafe,\ + -cppcoreguidelines-avoid-non-const-global-variables,\ + -cppcoreguidelines-init-variables,\ + -cppcoreguidelines-macro-to-enum,\ + -google-readability-braces-around-statements,\ + -hicpp-braces-around-statements,\ + -llvmlibc-restrict-system-libc-headers,\ + -misc-include-cleaner,\ + -modernize-macro-to-enum,\ + -readability-braces-around-statements,\ + -readability-identifier-length,\ + -readability-isolate-declaration \ + $(SRCS) + +clean: + rm -f $(TARGETS) *.o *~ diff --git a/doc/platform-requirements/README b/doc/platform-requirements/README new file mode 100644 index 00000000..3914d223 --- /dev/null +++ b/doc/platform-requirements/README @@ -0,0 +1,18 @@ +Platform Requirements +===================== + +TODO: document the various Linux specific features we currently require + + +Test Programs +------------- + +In some places we rely on quite specific behaviour of sockets. +Although Linux, at least, seems to behave as required, It's not always +clear from the available documentation if this is required by POSIX or +some other specification. + +To specifically document those expectations this directory has some +test programs which explicitly check for the behaviour we need. +When/if we attempt a port to a new platform, running these to check +behaviour would be a good place to start. diff --git a/doc/platform-requirements/common.c b/doc/platform-requirements/common.c new file mode 100644 index 00000000..d687377a --- /dev/null +++ b/doc/platform-requirements/common.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* common.c + * + * Common helper functions for testing SO_REUSEADDR behaviour + * + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + */ + +#include <errno.h> +#include <netinet/in.h> +#include <string.h> +#include <sys/socket.h> + +#include "common.h" + +int sock_reuseaddr(void) +{ + int y = 1; + int s; + + + s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (s < 0) + die("socket(): %s\n", strerror(errno)); + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) , 0) + die("SO_REUSEADDR: %s\n", strerror(errno)); + + return s; +} + +/* Send a token via the given connected socket */ +void send_token(int s, long token) +{ + ssize_t rc; + + rc = send(s, &token, sizeof(token), 0); + if (rc < 0) + die("send(): %s\n", strerror(errno)); + if (rc < sizeof(token)) + die("short send()\n"); +} + +/* Attempt to receive a token via the given socket. + * + * Returns true if we received the token, false if we got an EAGAIN, dies in any + * other case */ +bool recv_token(int s, long token) +{ + ssize_t rc; + long buf; + + rc = recv(s, &buf, sizeof(buf), MSG_DONTWAIT); + if (rc < 0) { + if (errno == EWOULDBLOCK) + return false; + die("recv(): %s\n", strerror(errno)); + } + if (rc < sizeof(buf)) + die("short recv()\n"); + if (buf != token) + die("data mismatch\n"); + return true; +} diff --git a/doc/platform-requirements/common.h b/doc/platform-requirements/common.h new file mode 100644 index 00000000..8844b1ed --- /dev/null +++ b/doc/platform-requirements/common.h @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* common.h + * + * Useful shared functions + * + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + */ +#ifndef REUSEADDR_COMMON_H +#define REUSEADDR_COMMON_H + +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +static inline void die(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + (void)vfprintf(stderr, fmt, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +#if __BYTE_ORDER == __BIG_ENDIAN +#define htons_constant(x) (x) +#define htonl_constant(x) (x) +#else +#define htons_constant(x) (__bswap_constant_16(x)) +#define htonl_constant(x) (__bswap_constant_32(x)) +#endif + +#define SOCKADDR_INIT(addr, port) \ + { \ + .sin_family = AF_INET, \ + .sin_addr = { .s_addr = htonl_constant(addr) }, \ + .sin_port = htons_constant(port), \ + } + +int sock_reuseaddr(void); +void send_token(int s, long token); +bool recv_token(int s, long token); + +#endif /* REUSEADDR_COMMON_H */ diff --git a/doc/platform-requirements/reuseaddr-priority.c b/doc/platform-requirements/reuseaddr-priority.c new file mode 100644 index 00000000..644553f8 --- /dev/null +++ b/doc/platform-requirements/reuseaddr-priority.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* reuseaddr-priority.c + * + * Verify which SO_REUSEADDR UDP sockets get priority to receive + * ============================================================= + * + * SO_REUSEADDR allows multiple sockets to bind to overlapping addresses, so + * there can be multiple sockets eligible to receive the same packet. The exact + * semantics of which socket will receive in this circumstance isn't very well + * documented. + * + * This program verifies that things behave the way we expect. Specifically we + * expect: + * + * - If both a connected and an unconnected socket could receive a datagram, the + * connected one will receive it in preference to the unconnected one. + * + * - If an unconnected socket bound to a specific address and an unconnected + * socket bound to the "any" address (0.0.0.0 or ::) could receive a datagram, + * then the one with a specific address will receive it in preference to the + * other. + * + * These should be true regardless of the order the sockets are created in, or + * the order they're polled in. + * + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + */ + +#include <arpa/inet.h> +#include <errno.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define SRCPORT 13246U +#define DSTPORT 13247U + +/* Different cases for receiving socket configuration */ +enum sock_type { + /* Socket is bound to 0.0.0.0:DSTPORT and not connected */ + SOCK_BOUND_ANY = 0, + + /* Socket is bound to 127.0.0.1:DSTPORT and not connected */ + SOCK_BOUND_LO = 1, + + /* Socket is bound to 0.0.0.0:DSTPORT and connected to 127.0.0.1:SRCPORT */ + SOCK_CONNECTED = 2, + + NUM_SOCK_TYPES, +}; + +typedef enum sock_type order_t[NUM_SOCK_TYPES]; + +static order_t orders[] = { + {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}, +}; + +/* 127.0.0.2 */ +#define INADDR_LOOPBACK2 ((in_addr_t)(0x7f000002)) + +/* 0.0.0.0:DSTPORT */ +static const struct sockaddr_in any_dst = SOCKADDR_INIT(INADDR_ANY, DSTPORT); +/* 127.0.0.1:DSTPORT */ +static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT); + +/* 127.0.0.2:DSTPORT */ +static const struct sockaddr_in lo2_dst = SOCKADDR_INIT(INADDR_LOOPBACK2, DSTPORT); + +/* 127.0.0.1:SRCPORT */ +static const struct sockaddr_in lo_src = SOCKADDR_INIT(INADDR_LOOPBACK, SRCPORT); + +/* Random token to send in datagram */ +static long token; + +/* Get a socket of the specified type for receiving */ +static int sock_recv(enum sock_type type) +{ + const struct sockaddr *connect_sa = NULL; + const struct sockaddr *bind_sa = NULL; + int s; + + s = sock_reuseaddr(); + + switch (type) { + case SOCK_CONNECTED: + connect_sa = (struct sockaddr *)&lo_src; + /* fallthrough */ + case SOCK_BOUND_ANY: + bind_sa = (struct sockaddr *)&any_dst; + break; + + case SOCK_BOUND_LO: + bind_sa = (struct sockaddr *)&lo_dst; + break; + + default: + die("bug"); + } + + if (bind_sa) + if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0) + die("bind(): %s\n", strerror(errno)); + if (connect_sa) + if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0) + die("connect(): %s\n", strerror(errno)); + + return s; +} + +/* Get a socket suitable for sending to the given type of receiving socket */ +static int sock_send(enum sock_type type) +{ + const struct sockaddr *connect_sa = NULL; + const struct sockaddr *bind_sa = NULL; + int s; + + s = sock_reuseaddr(); + + switch (type) { + case SOCK_BOUND_ANY: + connect_sa = (struct sockaddr *)&lo2_dst; + break; + + case SOCK_CONNECTED: + bind_sa = (struct sockaddr *)&lo_src; + /* fallthrough */ + case SOCK_BOUND_LO: + connect_sa = (struct sockaddr *)&lo_dst; + break; + + default: + die("bug"); + } + + if (bind_sa) + if (bind(s, bind_sa, sizeof(struct sockaddr_in)) < 0) + die("bind(): %s\n", strerror(errno)); + if (connect_sa) + if (connect(s, connect_sa, sizeof(struct sockaddr_in)) < 0) + die("connect(): %s\n", strerror(errno)); + + return s; +} + +/* Check for expected behaviour with one specific ordering for various operations: + * + * @recv_create_order: Order to create receiving sockets in + * @send_create_order: Order to create sending sockets in + * @test_order: Order to test the behaviour of different types + * @recv_order: Order to check the receiving sockets + */ +static void check_one_order(const order_t recv_create_order, + const order_t send_create_order, + const order_t test_order, + const order_t recv_order) +{ + int rs[NUM_SOCK_TYPES]; + int ss[NUM_SOCK_TYPES]; + int nfds = 0; + int i, j; + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + enum sock_type t = recv_create_order[i]; + int s; + + s = sock_recv(t); + if (s >= nfds) + nfds = s + 1; + + rs[t] = s; + } + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + enum sock_type t = send_create_order[i]; + + ss[t] = sock_send(t); + } + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + enum sock_type ti = test_order[i]; + int recv_via = -1; + + send_token(ss[ti], token); + + for (j = 0; j < NUM_SOCK_TYPES; j++) { + enum sock_type tj = recv_order[j]; + + if (recv_token(rs[tj], token)) { + if (recv_via != -1) + die("Received token more than once\n"); + recv_via = tj; + } + } + + if (recv_via == -1) + die("Didn't receive token at all\n"); + if (recv_via != ti) + die("Received token via unexpected socket\n"); + } + + for (i = 0; i < NUM_SOCK_TYPES; i++) { + close(rs[i]); + close(ss[i]); + } +} + +static void check_all_orders(void) +{ + int norders = sizeof(orders) / sizeof(orders[0]); + int i, j, k, l; + + for (i = 0; i < norders; i++) + for (j = 0; j < norders; j++) + for (k = 0; k < norders; k++) + for (l = 0; l < norders; l++) + check_one_order(orders[i], orders[j], + orders[j], orders[l]); +} + +int main(int argc, char *argv[]) +{ + (void)argc; + (void)argv; + + token = random(); + + check_all_orders(); + + printf("SO_REUSEADDR receive priorities seem to work as expected\n"); + + exit(0); +} -- 2.45.2
On Fri, 05 Jul 2024, David Gibson wrote: I may be missing something subtle, but is j intended to be used twice here, rather than k?+ +static void check_all_orders(void) +{ + int norders = sizeof(orders) / sizeof(orders[0]); + int i, j, k, l; + + for (i = 0; i < norders; i++) + for (j = 0; j < norders; j++) + for (k = 0; k < norders; k++) + for (l = 0; l < norders; l++) + check_one_order(orders[i], orders[j], + orders[j], orders[l]);--------------------------------------------------------^^^^^^^^^+}-- David Taylor
On Fri, Jul 12, 2024 at 12:42:57PM +0100, David Taylor wrote:On Fri, 05 Jul 2024, David Gibson wrote: I may be missing something subtle, but is j intended to be used twice here, rather than k?Indeed not, good catch, thanks.-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson+ +static void check_all_orders(void) +{ + int norders = sizeof(orders) / sizeof(orders[0]); + int i, j, k, l; + + for (i = 0; i < norders; i++) + for (j = 0; j < norders; j++) + for (k = 0; k < norders; k++) + for (l = 0; l < norders; l++) + check_one_order(orders[i], orders[j], + orders[j], orders[l]);--------------------------------------------------------^^^^^^^^^+}
Add a test program verifying that we're able to discard datagrams from a socket without needing a big discard buffer, by using a zero length recv(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- doc/platform-requirements/.gitignore | 1 + doc/platform-requirements/Makefile | 6 +-- doc/platform-requirements/recv-zero.c | 74 +++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 doc/platform-requirements/recv-zero.c diff --git a/doc/platform-requirements/.gitignore b/doc/platform-requirements/.gitignore index c1baa98e..555031d8 100644 --- a/doc/platform-requirements/.gitignore +++ b/doc/platform-requirements/.gitignore @@ -1 +1,2 @@ /reuseaddr-priority +/recv-zero diff --git a/doc/platform-requirements/Makefile b/doc/platform-requirements/Makefile index 6e1d966c..82aaac29 100644 --- a/doc/platform-requirements/Makefile +++ b/doc/platform-requirements/Makefile @@ -3,8 +3,8 @@ # Copyright Red Hat # Author: David Gibson <david(a)gibson.dropbear.id.au> -TARGETS = reuseaddr-priority -SRCS = reuseaddr-priority.c +TARGETS = reuseaddr-priority recv-zero +SRCS = reuseaddr-priority.c recv-zero.c CFLAGS = -Wall all: cppcheck clang-tidy $(TARGETS:%=check-%) @@ -16,7 +16,7 @@ check-%: % cppcheck: cppcheck --std=c11 --error-exitcode=1 --enable=all --force \ - --check-level=exhaustive \ + --check-level=exhaustive --inline-suppr \ --inconclusive --library=posix --quiet \ --suppress=missingIncludeSystem \ $(SRCS) diff --git a/doc/platform-requirements/recv-zero.c b/doc/platform-requirements/recv-zero.c new file mode 100644 index 00000000..f161e5c2 --- /dev/null +++ b/doc/platform-requirements/recv-zero.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* recv-zero.c + * + * Verify that we're able to discard datagrams by recv()ing into a zero-length + * buffer. + * + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + */ + +#include <arpa/inet.h> +#include <errno.h> +#include <net/if.h> +#include <netinet/in.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "common.h" + +#define DSTPORT 13257U + +/* 127.0.0.1:DSTPORT */ +static const struct sockaddr_in lo_dst = SOCKADDR_INIT(INADDR_LOOPBACK, DSTPORT); + +static void test_discard(void) +{ + long token1, token2; + int recv_s, send_s; + ssize_t rc; + + token1 = random(); + token2 = random(); + + recv_s = sock_reuseaddr(); + if (bind(recv_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("bind(): %s\n", strerror(errno)); + + send_s = sock_reuseaddr(); + if (connect(send_s, (struct sockaddr *)&lo_dst, sizeof(lo_dst)) < 0) + die("connect(): %s\n", strerror(errno)); + + send_token(send_s, token1); + send_token(send_s, token2); + + /* cppcheck-suppress nullPointer */ + rc = recv(recv_s, NULL, 0, MSG_DONTWAIT); + if (rc < 0) + die("discarding recv(): %s\n", strerror(errno)); + + recv_token(recv_s, token2); + + /* cppcheck-suppress nullPointer */ + rc = recv(recv_s, NULL, 0, MSG_DONTWAIT); + if (rc < 0 && errno != EAGAIN) + die("redundant discarding recv(): %s\n", strerror(errno)); + if (rc >= 0) + die("Unexpected receive: rc=%zd\n", rc); +} + +int main(int argc, char *argv[]) +{ + (void)argc; + (void)argv; + + test_discard(); + + printf("Discarding datagrams with a 0-length recv() seems to work\n"); + + exit(0); +} -- 2.45.2
On Fri, 5 Jul 2024 20:43:58 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:The redesign of UDP flows required (or at least, suggested) a new batch of prelininary changes that don't rely on the core of the flow table rework. Changes since v1: * Assorted minor fixes based on Stefano's feedback * Moved test programs from contrib/ to doc/ David Gibson (11): util: sock_l4() determine protocol from epoll type rather than the reverse flow: Add flow_sidx_valid() helper udp: Pass full epoll reference through more of sock handler path udp: Rename IOV and mmsghdr arrays udp: Unify udp[46]_mh_splice udp: Unify udp[46]_l2_iov udp: Don't repeatedly initialise udp[46]_eth_hdr udp: Move some more of sock_handler tasks into sub-functions udp: Consolidate datagram batching doc: Add program to document and test assumptions about SO_REUSEADDR doc: Test behaviour of zero length datagram recv()sApplied. -- Stefano