Thanks for the patch! It looks good to me, and all tests pass both with
and without SO_PEEK_OFF support.
Jon, Laurent, would you mind having a quick look before I apply this?
Gu, there's just one stray trailing whitespace character, indicated below.
There's no need to send a new version for that, though: I will just drop
it on merge:
On Mon, 8 Sep 2025 20:04:39 +0900
"xugu@redhat.com" wrote:
From: Xun Gu
On kernels without SO_PEEK_OFF, a 16MB static buffer is used to discard sent data. This patch reduces the buffer to 1MB.
Larger discards are now handled by using multiple iovec entries pointing to the same 1MB buffer.
Signed-off-by: Xun Gu
--- tcp.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++- tcp_buf.c | 18 +++++--------- tcp_internal.h | 7 +++++- tcp_vu.c | 17 ++++--------- 4 files changed, 82 insertions(+), 26 deletions(-) diff --git a/tcp.c b/tcp.c index a27b069..253cdb3 100644 --- a/tcp.c +++ b/tcp.c @@ -399,7 +399,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-char tcp_buf_discard [MAX_WINDOW]; +char tcp_buf_discard [BUF_DISCARD_SIZE];
/* Does the kernel support TCP_PEEK_OFF? */ bool peek_offset_cap; @@ -3766,3 +3766,67 @@ fail:
return 0; } + +/** + * tcp_prepare_iov() - Prepare iov according to kernel capability + * @msg: Message header to update + * @iov: iovec to receive TCP payload and data to discard + * @already_sent: Bytes sent after the last acknowledged one + * @payload_iov_cnt: Number of TCP payload iovec entries + * + * Return: 0 on success, -1 if already_sent cannot be discarded fully + */ +int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov, + uint32_t already_sent, int payload_iov_cnt) +{ + /* + * IOV layout + * |- tcp_buf_discard -|---------- TCP data slots ------------| + * + * with discarded data: + * |------ddddddddddddd|ttttttttttttt-------------------------| + * ^ + * | + * msg_iov + * + * without discarded data: + * |-------------------|ttttttttttttt-------------------------| + * ^ + * | + * msg_iov + * d: discard data + * t: TCP data + */ + if (peek_offset_cap) { + msg->msg_iov = iov + DISCARD_IOV_NUM; + msg->msg_iovlen = payload_iov_cnt; + } else { + int discard_cnt, discard_iov_rem; + struct iovec *iov_start; + int i; + + discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE); + if (discard_cnt > DISCARD_IOV_NUM) { + debug("Failed to discard %u already sent bytes", + already_sent); + return -1; + } + + discard_iov_rem = already_sent % BUF_DISCARD_SIZE; + + iov_start = iov + (DISCARD_IOV_NUM - discard_cnt); + + /* Multiple iov entries pointing to the same buffer */ + for (i = 0; i < discard_cnt; i++) { + iov_start[i].iov_base = tcp_buf_discard; + iov_start[i].iov_len = BUF_DISCARD_SIZE; + } + if (discard_iov_rem) + iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem; + + msg->msg_iov = iov_start; + msg->msg_iovlen = discard_cnt + payload_iov_cnt; + } + + return 0; +} diff --git a/tcp_buf.c b/tcp_buf.c index bc898de..4ebb013 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -60,7 +60,7 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM]; static unsigned int tcp_payload_used;
/* recvmsg()/sendmsg() data for tap */ -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; +static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
@@ -326,15 +326,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) iov_rem = (wnd_scaled - already_sent) % mss; }
- /* Prepare iov according to kernel capability */ - if (!peek_offset_cap) { - mh_sock.msg_iov = iov_sock; - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - mh_sock.msg_iovlen = fill_bufs + 1; - } else { - mh_sock.msg_iov = &iov_sock[1]; - mh_sock.msg_iovlen = fill_bufs; + if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) { + tcp_rst(c, conn); + return -1; }
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) { @@ -344,12 +338,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) tcp_payload_used = 0; }
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) { iov->iov_base = &tcp_payload[tcp_payload_used + i].data; iov->iov_len = mss; } if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; + iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */ do diff --git a/tcp_internal.h b/tcp_internal.h index 9dae688..d0009f8 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -9,6 +9,9 @@ #define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS)))
+#define BUF_DISCARD_SIZE (1 << 20)
                                   ^ ...the stray trailing whitespace is here, after the ')'
(git log/show shows it in red).
--
Stefano