To send frames on the tap interface, the UDP uses a fairly complicated two level batching. First multiple frames are gathered into a single "message" for the qemu stream socket, then multiple messages are send with sendmmsg(). We now have tap_send_frames() which already deals with sending a number of frames, including batching and handling partial sends. Use that to considerably simplify things. This does make a couple of behavioural changes: * We used to split messages to keep them under 32kiB (except when a single frame was longer than that). The comments claim this is needed to stop qemu from closing the connection, but we don't have any equivalent logic for TCP. I wasn't able to reproduce the problem with this series, although it was apparently easy to reproduce earlier. My suspicion is that there was never an inherent need to keep messages small, however with larger messages (and default kernel buffer sizes) the chances of needing more than one resend for partial send()s is greatly increased. We used not to correctly handle that case of multiple resends, but now we do. * Previously when we got a partial send on UDP, we would resend the remainder of the entire "message", including multiple frames. The common code now only resends the remainder of a single frame, simply dropping any frames which weren't even partially sent. This is what TCP always did and is probably a better idea for UDP too. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pcap.c | 30 ------------ pcap.h | 1 - udp.c | 141 +++------------------------------------------------------ udp.h | 2 +- 4 files changed, 8 insertions(+), 166 deletions(-) diff --git a/pcap.c b/pcap.c index 7dca599..c7c6ffd 100644 --- a/pcap.c +++ b/pcap.c @@ -130,36 +130,6 @@ void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset) } } -/** - * pcapm() - Capture multiple frames from multiple message headers to pcap file - * @mmh: Pointer to first sendmmsg() header - */ -void pcapmm(const struct mmsghdr *mmh, unsigned int vlen) -{ - struct timeval tv; - unsigned int i, j; - - if (pcap_fd == -1) - return; - - gettimeofday(&tv, NULL); - - for (i = 0; i < vlen; i++) { - const struct msghdr *mh = &mmh[i].msg_hdr; - - for (j = 0; j < mh->msg_iovlen; j++) { - const struct iovec *iov = &mh->msg_iov[j]; - - if (pcap_frame((char *)iov->iov_base + 4, - iov->iov_len - 4, &tv) != 0) { - debug("Cannot log packet, length %lu", - iov->iov_len - 4); - return; - } - } - } -} - /** * pcap_init() - Initialise pcap file * @c: Execution context diff --git a/pcap.h b/pcap.h index eafc89b..c2af3cf 100644 --- a/pcap.h +++ b/pcap.h @@ -8,7 +8,6 @@ void pcap(const char *pkt, size_t len); void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset); -void pcapmm(const struct mmsghdr *mmh, unsigned int vlen); void pcap_init(struct ctx *c); #endif /* PCAP_H */ diff --git a/udp.c b/udp.c index 934d32e..adb47e8 100644 --- a/udp.c +++ b/udp.c @@ -224,9 +224,6 @@ static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES]; static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES]; static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; -static struct mmsghdr udp4_l2_mh_tap [UDP_MAX_FRAMES]; -static struct mmsghdr udp6_l2_mh_tap [UDP_MAX_FRAMES]; - /* recvmmsg()/sendmmsg() data for "spliced" connections */ static struct iovec udp4_iov_splice [UDP_MAX_FRAMES]; static struct iovec udp6_iov_splice [UDP_MAX_FRAMES]; @@ -336,12 +333,9 @@ static void udp_sock4_iov_init(const struct ctx *c) } for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct msghdr *mh = &udp4_l2_mh_tap[i].msg_hdr; struct iovec *iov = &udp4_l2_iov_tap[i]; - iov->iov_base = tap_iov_base(c, &udp4_l2_buf[i].taph); - mh->msg_iov = iov; - mh->msg_iovlen = 1; + iov->iov_base = tap_iov_base(c, &udp4_l2_buf[i].taph); } } @@ -374,12 +368,9 @@ static void udp_sock6_iov_init(const struct ctx *c) } for (i = 0; i < UDP_MAX_FRAMES; i++) { - struct msghdr *mh = &udp6_l2_mh_tap[i].msg_hdr; struct iovec *iov = &udp6_l2_iov_tap[i]; - iov->iov_base = tap_iov_base(c, &udp6_l2_buf[i].taph); - mh->msg_iov = iov; - mh->msg_iovlen = 1; + iov->iov_base = tap_iov_base(c, &udp6_l2_buf[i].taph); } } @@ -703,102 +694,6 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport, return tap_iov_len(c, &b->taph, ip_len); } -/** - * udp_tap_send_pasta() - Send datagrams to the pasta tap interface - * @c: Execution context - * @mmh: Array of message headers to send - * @n: Number of message headers to send - * - * #syscalls:pasta write - */ -static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh, - unsigned int n) -{ - unsigned int i, j; - - for (i = 0; i < n; i++) { - for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) { - struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j]; - - /* We can't use writev() because the tap - * character device relies on the write() - * boundaries to discern frame boundaries - */ - if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0) - debug("tap write: %s", strerror(errno)); - else - pcap(iov->iov_base, iov->iov_len); - } - } -} - -/** - * udp_tap_send_passt() - Send datagrams to the passt tap interface - * @c: Execution context - * @mmh: Array of message headers to send - * @n: Number of message headers to send - * - * #syscalls:passt sendmmsg sendmsg - */ -static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n) -{ - struct msghdr *last_mh; - ssize_t missing = 0; - size_t msg_len = 0; - unsigned int i; - int ret; - - ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT); - if (ret <= 0) - return; - - /* If we lose some messages to sendmmsg() here, fine, it's UDP. However, - * the last message needs to be delivered completely, otherwise qemu - * will fail to reassemble the next message and close the connection. Go - * through headers from the last sent message, counting bytes, and, if - * and as soon as we see more bytes than sendmmsg() sent, re-send the - * rest with a blocking call. - * - * In pictures, given this example: - * - * iov #0 iov #1 iov #2 iov #3 - * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ...... - * tap_mmh[ret - 1].msg_len: 7 .... ... - * - * when 'msglen' reaches: 10 ^ - * and 'missing' below is: 3 --- - * - * re-send everything from here: ^-- ----- ------ - */ - last_mh = &mmh[ret - 1].msg_hdr; - for (i = 0; i < last_mh->msg_iovlen; i++) { - if (missing <= 0) { - msg_len += last_mh->msg_iov[i].iov_len; - missing = msg_len - mmh[ret - 1].msg_len; - } - - if (missing > 0) { - uint8_t **iov_base; - int first_offset; - - iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base; - first_offset = last_mh->msg_iov[i].iov_len - missing; - *iov_base += first_offset; - last_mh->msg_iov[i].iov_len = missing; - - last_mh->msg_iov = &last_mh->msg_iov[i]; - - if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0) - debug("UDP: %li bytes to tap missing", missing); - - *iov_base -= first_offset; - break; - } - } - - pcapmm(mmh, ret); -} - /** * udp_tap_send() - Prepare UDP datagrams and send to tap interface * @c: Execution context @@ -810,25 +705,18 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n) * * Return: size of tap frame with headers */ -static void udp_tap_send(const struct ctx *c, +static void udp_tap_send(struct ctx *c, unsigned int start, unsigned int n, in_port_t dstport, bool v6, const struct timespec *now) { - int msg_bufs = 0, msg_i = 0; - struct mmsghdr *tap_mmh; struct iovec *tap_iov; - ssize_t msg_len = 0; unsigned int i; - if (v6) { - tap_mmh = udp6_l2_mh_tap; + if (v6) tap_iov = udp6_l2_iov_tap; - } else { - tap_mmh = udp4_l2_mh_tap; + else tap_iov = udp4_l2_iov_tap; - } - tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start]; for (i = start; i < start + n; i++) { size_t buf_len; @@ -838,24 +726,9 @@ static void udp_tap_send(const struct ctx *c, buf_len = udp_update_hdr4(c, i, dstport, now); tap_iov[i].iov_len = buf_len; - - /* With bigger messages, qemu closes the connection. */ - if (c->mode == MODE_PASST && msg_bufs && - msg_len + buf_len > SHRT_MAX) { - tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - msg_i++; - tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i]; - msg_len = msg_bufs = 0; - } - msg_len += buf_len; - msg_bufs++; } - tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs; - if (c->mode == MODE_PASTA) - udp_tap_send_pasta(c, tap_mmh, msg_i + 1); - else - udp_tap_send_passt(c, tap_mmh, msg_i + 1); + tap_send_frames(c, tap_iov + start, n); } /** @@ -867,7 +740,7 @@ static void udp_tap_send(const struct ctx *c, * * #syscalls recvmmsg */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { /* For not entirely clear reasons (data locality?) pasta gets diff --git a/udp.h b/udp.h index 2a03335..68082ea 100644 --- a/udp.h +++ b/udp.h @@ -8,7 +8,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, int af, const void *addr, const struct pool *p, const struct timespec *now); -- 2.39.0