At present, the UDP "splice" and "tap" paths are quite separate. We have separate sockets to receive packets bound for the tap and splice paths. This leads to some code duplication, and extra open sockets. This series partially unifies the two paths, allowing us to use a single (host side) socket, bound to 0.0.0.0 or :: to receive packets for both cases. This is based on my earlier series with some fixes for the tap path. Changes since v1: * Renamed udp_localname[46] to udp[46]_localname * Allow handling of UDP port 0 * Fix a bug which could misidentify certain v6 packets as v4-spliceable * Some minor cosmetic fixes to code and commit messages David Gibson (8): udp: Move sending pasta tap frames to the end of udp_sock_handler() udp: Split sending to passt tap interface into separate function udp: Split receive from preparation and send in udp_sock_handler() udp: Receive multiple datagrams at once on the pasta sock->tap path udp: Pre-populate msg_names with local address udp: Unify udp_sock_handler_splice() with udp_sock_handler() udp: Decide whether to "splice" per datagram rather than per socket udp: Don't use separate sockets to listen for spliced packets udp.c | 380 ++++++++++++++++++++++++++++++--------------------------- udp.h | 2 +- util.h | 7 ++ 3 files changed, 205 insertions(+), 184 deletions(-) -- 2.38.1
udp_sock_handler() has a surprising difference in flow between pasta and passt mode: For pasta we send each frame to the tap interface as we prepare it. For passt, though, we prepare all the frames, then send them with a single sendmmsg(). Alter the pasta path to also prepare all the frames, then send them at the end. We already have a suitable data structure for the passt case. This will make it easier to abstract out the tap backend difference in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 61 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/udp.c b/udp.c index 4610a02..a27fe88 100644 --- a/udp.c +++ b/udp.c @@ -783,6 +783,35 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport, return buf_len; } +/** + * udp_tap_send_pasta() - Send datagrams to the pasta tap interface + * @c: Execution context + * @mmh: Array of message headers to send + * @n: Number of message headers to send + * + * #syscalls:pasta write + */ +static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh, + unsigned int n) +{ + unsigned int i, j; + + for (i = 0; i < n; i++) { + for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) { + struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j]; + + /* We can't use writev() because the tap + * character device relies on the write() + * boundaries to discern frame boundaries + */ + if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0) + debug("tap write: %s", strerror(errno)); + else + pcap(iov->iov_base, iov->iov_len); + } + } +} + /** * udp_sock_handler() - Handle new data from socket * @c: Execution context @@ -835,31 +864,25 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, else buf_len = udp_update_hdr4(c, i, dstport, now); - if (c->mode == MODE_PASTA) { - void *frame = tap_iov[i].iov_base; + tap_iov[i].iov_len = buf_len; - if (write(c->fd_tap, frame, buf_len) < 0) - debug("tap write: %s", strerror(errno)); - pcap(frame, buf_len); - } else { - tap_iov[i].iov_len = buf_len; - - /* With bigger messages, qemu closes the connection. */ - if (msg_bufs && msg_len + buf_len > SHRT_MAX) { - tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - msg_i++; - tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i]; - msg_len = msg_bufs = 0; - } - - msg_len += buf_len; - msg_bufs++; + /* With bigger messages, qemu closes the connection. */ + if (c->mode == MODE_PASST && msg_bufs && + msg_len + buf_len > SHRT_MAX) { + tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; + msg_i++; + tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i]; + msg_len = msg_bufs = 0; } + msg_len += buf_len; + msg_bufs++; } tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - if (c->mode == MODE_PASTA) + if (c->mode == MODE_PASTA) { + udp_tap_send_pasta(c, tap_mmh, msg_i + 1); return; + } ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1, MSG_NOSIGNAL | MSG_DONTWAIT); -- 2.38.1
The last part of udp_sock_handler() does the actual sending of frames to the tap interface. For pasta that's just a call to udp_tap_send_pasta() but for passt, it's moderately complex and open coded. For symmetry, move the passt send path into its own function, udp_tap_send_passt(). This will make it easier to abstract the tap interface in future (e.g. when we want to add vhost-user). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 130 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 72 insertions(+), 58 deletions(-) diff --git a/udp.c b/udp.c index a27fe88..7281bc3 100644 --- a/udp.c +++ b/udp.c @@ -812,6 +812,73 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh, } } +/** + * udp_tap_send_passt() - Send datagrams to the passt tap interface + * @c: Execution context + * @mmh: Array of message headers to send + * @n: Number of message headers to send + * + * #syscalls:passt sendmmsg sendmsg + */ +static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n) +{ + struct msghdr *last_mh; + ssize_t missing = 0; + size_t msg_len = 0; + unsigned int i; + int ret; + + ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT); + if (ret <= 0) + return; + + /* If we lose some messages to sendmmsg() here, fine, it's UDP. However, + * the last message needs to be delivered completely, otherwise qemu + * will fail to reassemble the next message and close the connection. Go + * through headers from the last sent message, counting bytes, and, if + * and as soon as we see more bytes than sendmmsg() sent, re-send the + * rest with a blocking call. + * + * In pictures, given this example: + * + * iov #0 iov #1 iov #2 iov #3 + * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ...... + * tap_mmh[ret - 1].msg_len: 7 .... ... + * + * when 'msglen' reaches: 10 ^ + * and 'missing' below is: 3 --- + * + * re-send everything from here: ^-- ----- ------ + */ + last_mh = &mmh[ret - 1].msg_hdr; + for (i = 0; i < last_mh->msg_iovlen; i++) { + if (missing <= 0) { + msg_len += last_mh->msg_iov[i].iov_len; + missing = msg_len - mmh[ret - 1].msg_len; + } + + if (missing > 0) { + uint8_t **iov_base; + int first_offset; + + iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base; + first_offset = last_mh->msg_iov[i].iov_len - missing; + *iov_base += first_offset; + last_mh->msg_iov[i].iov_len = missing; + + last_mh->msg_iov = &last_mh->msg_iov[i]; + + if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0) + debug("UDP: %li bytes to tap missing", missing); + + *iov_base -= first_offset; + break; + } + } + + pcapmm(mmh, ret); +} + /** * udp_sock_handler() - Handle new data from socket * @c: Execution context @@ -820,16 +887,14 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh, * @now: Current timestamp * * #syscalls recvmmsg - * #syscalls:passt sendmmsg sendmsg */ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { in_port_t dstport = ref.r.p.udp.udp.port; - ssize_t n, msg_len = 0, missing = 0; struct mmsghdr *tap_mmh, *sock_mmh; - int msg_bufs = 0, msg_i = 0, ret; - struct msghdr *last_mh; + int msg_bufs = 0, msg_i = 0; + ssize_t n, msg_len = 0; struct iovec *tap_iov; unsigned int i; @@ -879,61 +944,10 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, } tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - if (c->mode == MODE_PASTA) { + if (c->mode == MODE_PASTA) udp_tap_send_pasta(c, tap_mmh, msg_i + 1); - return; - } - - ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1, - MSG_NOSIGNAL | MSG_DONTWAIT); - if (ret <= 0) - return; - - /* If we lose some messages to sendmmsg() here, fine, it's UDP. However, - * the last message needs to be delivered completely, otherwise qemu - * will fail to reassemble the next message and close the connection. Go - * through headers from the last sent message, counting bytes, and, if - * and as soon as we see more bytes than sendmmsg() sent, re-send the - * rest with a blocking call. - * - * In pictures, given this example: - * - * iov #0 iov #1 iov #2 iov #3 - * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ...... - * tap_mmh[ret - 1].msg_len: 7 .... ... - * - * when 'msglen' reaches: 10 ^ - * and 'missing' below is: 3 --- - * - * re-send everything from here: ^-- ----- ------ - */ - last_mh = &tap_mmh[ret - 1].msg_hdr; - for (i = 0, msg_len = 0; i < last_mh->msg_iovlen; i++) { - if (missing <= 0) { - msg_len += last_mh->msg_iov[i].iov_len; - missing = msg_len - tap_mmh[ret - 1].msg_len; - } - - if (missing > 0) { - uint8_t **iov_base; - int first_offset; - - iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base; - first_offset = last_mh->msg_iov[i].iov_len - missing; - *iov_base += first_offset; - last_mh->msg_iov[i].iov_len = missing; - - last_mh->msg_iov = &last_mh->msg_iov[i]; - - if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0) - debug("UDP: %li bytes to tap missing", missing); - - *iov_base -= first_offset; - break; - } - } - - pcapmm(tap_mmh, ret); + else + udp_tap_send_passt(c, tap_mmh, msg_i + 1); } /** -- 2.38.1
The receive part of udp_sock_handler() and udp_sock_handler_splice() is now almost identical. In preparation for merging that, split the receive part of udp_sock_handler() from the part preparing and sending the frames for sending on the tap interface. The latter goes into a new udp_tap_send() function. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 79 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/udp.c b/udp.c index 7281bc3..64c9219 100644 --- a/udp.c +++ b/udp.c @@ -880,51 +880,39 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n) } /** - * udp_sock_handler() - Handle new data from socket + * udp_tap_send() - Prepare UDP datagrams and send to tap interface * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap + * @start: Index of first datagram in udp[46]_l2_buf pool + * @n: Number of datagrams to send + * @dstport: Destination port number + * @v6: True if using IPv6 * @now: Current timestamp * - * #syscalls recvmmsg + * Return: size of tap frame with headers */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +static void udp_tap_send(const struct ctx *c, + unsigned int start, unsigned int n, + in_port_t dstport, bool v6, const struct timespec *now) { - in_port_t dstport = ref.r.p.udp.udp.port; - struct mmsghdr *tap_mmh, *sock_mmh; int msg_bufs = 0, msg_i = 0; - ssize_t n, msg_len = 0; + struct mmsghdr *tap_mmh; struct iovec *tap_iov; + ssize_t msg_len = 0; unsigned int i; - if (events == EPOLLERR) - return; - - if (ref.r.p.udp.udp.splice) { - udp_sock_handler_splice(c, ref, events, now); - return; - } - - if (ref.r.p.udp.udp.v6) { + if (v6) { tap_mmh = udp6_l2_mh_tap; - sock_mmh = udp6_l2_mh_sock; tap_iov = udp6_l2_iov_tap; } else { tap_mmh = udp4_l2_mh_tap; - sock_mmh = udp4_l2_mh_sock; tap_iov = udp4_l2_iov_tap; } - n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL); - if (n <= 0) - return; - - tap_mmh[0].msg_hdr.msg_iov = &tap_iov[0]; - for (i = 0; i < (unsigned)n; i++) { + tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start]; + for (i = start; i < start + n; i++) { size_t buf_len; - if (ref.r.p.udp.udp.v6) + if (v6) buf_len = udp_update_hdr6(c, i, dstport, now); else buf_len = udp_update_hdr4(c, i, dstport, now); @@ -950,6 +938,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, udp_tap_send_passt(c, tap_mmh, msg_i + 1); } +/** + * udp_sock_handler() - Handle new data from socket + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * @now: Current timestamp + * + * #syscalls recvmmsg + */ +void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, + const struct timespec *now) +{ + in_port_t dstport = ref.r.p.udp.udp.port; + bool v6 = ref.r.p.udp.udp.v6; + struct mmsghdr *sock_mmh; + ssize_t n; + + if (events == EPOLLERR) + return; + + if (ref.r.p.udp.udp.splice) { + udp_sock_handler_splice(c, ref, events, now); + return; + } + + if (ref.r.p.udp.udp.v6) + sock_mmh = udp6_l2_mh_sock; + else + sock_mmh = udp4_l2_mh_sock; + + n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL); + if (n <= 0) + return; + + udp_tap_send(c, 0, n, dstport, v6, now); +} + /** * udp_tap_handler() - Handle packets from tap * @c: Execution context -- 2.38.1
Usually udp_sock_handler() will receive and forward multiple (up to 32) datagrams in udp_sock_handler(), then forward them all to the tap interface. However, when in pasta mode we will only receive and forward a single datagram at a time. Change it to receive multiple datagrams at once, like the other paths. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/udp.c b/udp.c index 64c9219..24fa984 100644 --- a/udp.c +++ b/udp.c @@ -119,7 +119,6 @@ #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ -#define UDP_TAP_FRAMES (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1) /** * struct udp_tap_port - Port tracking based on tap-facing source port @@ -968,7 +967,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, else sock_mmh = udp4_l2_mh_sock; - n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL); + n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL); if (n <= 0) return; -- 2.38.1
udp_splice_namebuf is now used only for spliced sending, and so it is only ever populated with the localhost address, either IPv4 or IPv6. So, replace the awkward initialization in udp_sock_handler_splice() with statically initialized versions for IPv4 and IPv6. We then just need to update the port number in udp_sock_handler_splice(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 40 ++++++++++++++++++---------------------- util.h | 7 +++++++ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/udp.c b/udp.c index 24fa984..98d8687 100644 --- a/udp.c +++ b/udp.c @@ -232,11 +232,18 @@ static struct mmsghdr udp4_l2_mh_tap [UDP_MAX_FRAMES]; static struct mmsghdr udp6_l2_mh_tap [UDP_MAX_FRAMES]; /* recvmmsg()/sendmmsg() data for "spliced" connections */ -static struct sockaddr_storage udp_splice_namebuf; - static struct iovec udp4_iov_splice [UDP_MAX_FRAMES]; static struct iovec udp6_iov_splice [UDP_MAX_FRAMES]; +static struct sockaddr_in udp4_localname = { + .sin_family = AF_INET, + .sin_addr = IN4ADDR_LOOPBACK_INIT, +}; +static struct sockaddr_in6 udp6_localname = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, +}; + static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; @@ -610,23 +617,10 @@ static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref, if (n <= 0) return; - if (v6) { - *((struct sockaddr_in6 *)&udp_splice_namebuf) = - ((struct sockaddr_in6) { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - .sin6_port = htons(dst), - .sin6_scope_id = 0, - }); - } else { - *((struct sockaddr_in *)&udp_splice_namebuf) = - ((struct sockaddr_in) { - .sin_family = AF_INET, - .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) }, - .sin_port = htons(dst), - .sin_zero = { 0 }, - }); - } + if (v6) + udp6_localname.sin6_port = htons(dst); + else + udp4_localname.sin_port = htons(dst); for (i = 0; i < n; i += m) { in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name); @@ -1252,9 +1246,11 @@ static void udp_splice_iov_init(void) struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr; struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr; - mh4->msg_name = mh6->msg_name = &udp_splice_namebuf; - mh4->msg_namelen = sizeof(udp_splice_namebuf); - mh6->msg_namelen = sizeof(udp_splice_namebuf); + mh4->msg_name = &udp4_localname; + mh4->msg_namelen = sizeof(udp4_localname); + + mh6->msg_name = &udp6_localname; + mh6->msg_namelen = sizeof(udp6_localname); udp4_iov_splice[i].iov_base = udp4_l2_buf[i].data; udp6_iov_splice[i].iov_base = udp6_l2_buf[i].data; diff --git a/util.h b/util.h index 02b55a9..3baec91 100644 --- a/util.h +++ b/util.h @@ -79,6 +79,13 @@ (IN_MULTICAST(ntohl((a)->s_addr))) #define IN4_ARE_ADDR_EQUAL(a, b) \ (((struct in_addr *)(a))->s_addr == ((struct in_addr *)b)->s_addr) +#if __BYTE_ORDER == __BIG_ENDIAN +#define IN4ADDR_LOOPBACK_INIT \ + { .s_addr = INADDR_LOOPBACK } +#else +#define IN4ADDR_LOOPBACK_INIT \ + { .s_addr = __bswap_constant_32(INADDR_LOOPBACK) } +#endif #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, -- 2.38.1
These two functions now have a very similar structure, and their first part (calling recvmmsg()) is functionally identical. So, merge the two functions into one. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 86 +++++++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/udp.c b/udp.c index 98d8687..ed31216 100644 --- a/udp.c +++ b/udp.c @@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL); } -/** - * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection - * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * @now: Current timestamp - */ -static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref, - uint32_t events, const struct timespec *now) -{ - in_port_t dst = ref.r.p.udp.udp.port; - int v6 = ref.r.p.udp.udp.v6, n, i, m; - struct mmsghdr *mmh_recv; - - if (!(events & EPOLLIN)) - return; - - if (v6) - mmh_recv = udp6_l2_mh_sock; - else - mmh_recv = udp4_l2_mh_sock; - - n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL); - - if (n <= 0) - return; - - if (v6) - udp6_localname.sin6_port = htons(dst); - else - udp4_localname.sin_port = htons(dst); - - for (i = 0; i < n; i += m) { - in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name); - - for (m = 1; i + m < n; m++) { - void *mname = mmh_recv[i + m].msg_hdr.msg_name; - if (sa_port(v6, mname) != src) - break; - } - - udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns, - ref.r.p.udp.udp.orig, now); - } -} - /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @c: Execution context @@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, { in_port_t dstport = ref.r.p.udp.udp.port; bool v6 = ref.r.p.udp.udp.v6; - struct mmsghdr *sock_mmh; + struct mmsghdr *mmh_recv; + unsigned int i, m; ssize_t n; - if (events == EPOLLERR) + if (!(events & EPOLLIN)) return; - if (ref.r.p.udp.udp.splice) { - udp_sock_handler_splice(c, ref, events, now); - return; + if (v6) { + mmh_recv = udp6_l2_mh_sock; + udp6_localname.sin6_port = htons(dstport); + } else { + mmh_recv = udp4_l2_mh_sock; + udp4_localname.sin_port = htons(dstport); } - if (ref.r.p.udp.udp.v6) - sock_mmh = udp6_l2_mh_sock; - else - sock_mmh = udp4_l2_mh_sock; - - n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL); + n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL); if (n <= 0) return; - udp_tap_send(c, 0, n, dstport, v6, now); + if (!ref.r.p.udp.udp.splice) { + udp_tap_send(c, 0, n, dstport, v6, now); + return; + } + + for (i = 0; i < n; i += m) { + in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name); + + for (m = 1; i + m < n; m++) { + void *mname = mmh_recv[i + m].msg_hdr.msg_name; + if (sa_port(v6, mname) != src) + break; + } + + udp_splice_sendfrom(c, i, m, src, dstport, v6, + ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig, + now); + } } /** -- 2.38.1
Currently we have special sockets for receiving datagrams from locahost which can use the optimized "splice" path rather than going across the tap interface. We want to loosen this so that sockets can receive sockets that will be forwarded by both the spliced and non-spliced paths. To do this, we alter the meaning of the @splice bit in the reference to mean that packets receieved on this socket *can* be spliced, not that they *will* be spliced. They'll only actually be spliced if they come from 127.0.0.1 or ::1. We can't (for now) remove the splice bit entirely, unlike with TCP. Our gateway mapping means that if the ns initiates communication to the gw address, we'll translate that to target 127.0.0.1 on the host side. Reply packets will therefore have source address 127.0.0.1 when received on the host, but these need to go via the tap path where that will be translated back to the gateway address. We need the @splice bit to distinguish that case from packets going from localhost to a port mapped explicitly with -u which should be spliced. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 52 +++++++++++++++++++++++++++++++++------------------- udp.h | 2 +- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/udp.c b/udp.c index ed31216..8ba3453 100644 --- a/udp.c +++ b/udp.c @@ -513,16 +513,25 @@ static int udp_splice_new_ns(void *arg) } /** - * sa_port() - Determine port from a sockaddr_in or sockaddr_in6 + * udp_mmh_splice_port() - Is source address of message suitable for splicing? * @v6: Is @sa a sockaddr_in6 (otherwise sockaddr_in)? - * @sa: Pointer to either sockaddr_in or sockaddr_in6 + * @mmh: mmsghdr of incoming message + * + * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from + * @sa in host order, otherwise -1. */ -static in_port_t sa_port(bool v6, const void *sa) +static int udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh) { - const struct sockaddr_in6 *sa6 = sa; - const struct sockaddr_in *sa4 = sa; + const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name; + const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name; + + if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) + return ntohs(sa6->sin6_port); + + if (!v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) + return ntohs(sa4->sin_port); - return v6 ? ntohs(sa6->sin6_port) : ntohs(sa4->sin_port); + return -1; } /** @@ -918,23 +927,28 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, if (n <= 0) return; - if (!ref.r.p.udp.udp.splice) { - udp_tap_send(c, 0, n, dstport, v6, now); - return; - } - for (i = 0; i < n; i += m) { - in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name); + int splicefrom = -1; + m = n; + + if (ref.r.p.udp.udp.splice) { + splicefrom = udp_mmh_splice_port(v6, mmh_recv + i); + + for (m = 1; i + m < n; m++) { + int p; - for (m = 1; i + m < n; m++) { - void *mname = mmh_recv[i + m].msg_hdr.msg_name; - if (sa_port(v6, mname) != src) - break; + p = udp_mmh_splice_port(v6, mmh_recv + i + m); + if (p != splicefrom) + break; + } } - udp_splice_sendfrom(c, i, m, src, dstport, v6, - ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig, - now); + if (splicefrom >= 0) + udp_splice_sendfrom(c, i, m, splicefrom, dstport, + v6, ref.r.p.udp.udp.ns, + ref.r.p.udp.udp.orig, now); + else + udp_tap_send(c, i, m, dstport, v6, now); } } diff --git a/udp.h b/udp.h index 053991e..2a03335 100644 --- a/udp.h +++ b/udp.h @@ -22,7 +22,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, /** * union udp_epoll_ref - epoll reference portion for TCP connections * @bound: Set if this file descriptor is a bound socket - * @splice: Set if descriptor is associated to "spliced" connection + * @splice: Set if descriptor packets to be "spliced" * @orig: Set if a spliced socket which can originate "connections" * @ns: Set if this is a socket in the pasta network namespace * @v6: Set for IPv6 sockets or connections -- 2.38.1
Currently, when ports are forwarded inbound in pasta mode, we open two sockets for incoming traffic: one listens on the public IP address and will forward packets to the tuntap interface. The other listens on localhost and forwards via "splicing" (resending directly via sockets in the ns). Now that we've improved the logic about whether we "splice" any individual packet, we don't need this. Instead we can have a single socket bound to 0.0.0.0 or ::, marked as able to splice and udp_sock_handler() will deal with each packet as appropriate. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 53 +++++++++++++---------------------------------------- 1 file changed, 13 insertions(+), 40 deletions(-) diff --git a/udp.c b/udp.c index 8ba3453..228a086 100644 --- a/udp.c +++ b/udp.c @@ -1116,7 +1116,6 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { union udp_epoll_ref uref = { .u32 = 0 }; - const void *bind_addr; int s; if (ns) { @@ -1128,67 +1127,41 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af, } if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { - if (!addr && c->mode == MODE_PASTA) - bind_addr = &c->ip4.addr; - else - bind_addr = addr; - uref.udp.v6 = 0; + uref.udp.splice = (c->mode == MODE_PASTA); + uref.udp.orig = true; if (!ns) { - uref.udp.splice = 0; - s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, ifname, + s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname, port, uref.u32); udp_tap_map[V4][uref.udp.port].sock = s; - - if (c->mode == MODE_PASTA) { - bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) }; - uref.udp.splice = uref.udp.orig = true; - - s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, - ifname, port, uref.u32); - udp_splice_init[V4][port].sock = s; - } + udp_splice_init[V4][port].sock = s; } else { - uref.udp.splice = uref.udp.orig = uref.udp.ns = true; - - bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) }; + struct in_addr loopback = { htonl(INADDR_LOOPBACK) }; + uref.udp.ns = true; - s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, + s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback, ifname, port, uref.u32); udp_splice_ns[V4][port].sock = s; } } if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { - if (!addr && c->mode == MODE_PASTA) - bind_addr = &c->ip6.addr; - else - bind_addr = addr; - uref.udp.v6 = 1; + uref.udp.splice = (c->mode == MODE_PASTA); + uref.udp.orig = true; if (!ns) { - uref.udp.splice = 0; - s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, ifname, + s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname, port, uref.u32); udp_tap_map[V6][uref.udp.port].sock = s; - - if (c->mode == MODE_PASTA) { - bind_addr = &in6addr_loopback; - uref.udp.splice = uref.udp.orig = true; - - s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, - ifname, port, uref.u32); - udp_splice_init[V6][port].sock = s; - } + udp_splice_init[V6][port].sock = s; } else { - bind_addr = &in6addr_loopback; - uref.udp.splice = uref.udp.orig = uref.udp.ns = true; + uref.udp.ns = true; - s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, + s = sock_l4(c, AF_INET6, IPPROTO_UDP, &in6addr_loopback, ifname, port, uref.u32); udp_splice_ns[V6][port].sock = s; } -- 2.38.1
On Thu, 15 Dec 2022 12:30:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:At present, the UDP "splice" and "tap" paths are quite separate. We have separate sockets to receive packets bound for the tap and splice paths. This leads to some code duplication, and extra open sockets. This series partially unifies the two paths, allowing us to use a single (host side) socket, bound to 0.0.0.0 or :: to receive packets for both cases. This is based on my earlier series with some fixes for the tap path. Changes since v1: * Renamed udp_localname[46] to udp[46]_localname * Allow handling of UDP port 0 * Fix a bug which could misidentify certain v6 packets as v4-spliceable * Some minor cosmetic fixes to code and commit messagesIt looks good to me now, thanks. I'm still trying to figure out the throughput (to tap) topic before applying -- results look pretty much the same now, but I'm not sure why, and I'd like to have a second look at perf(1) statistics. -- Stefano