We discovered some problems caused by the recent merge of SO_PEEK_OFF support. 1/2 is a fix from Jon, modified by Stefano. 2/2 is a fix for a different issue. David Gibson (1): tcp: Correctly update SO_PEEK_OFF when tcp_send_frames() drops frames Jon Maloy (1): tcp: probe for SO_PEEK_OFF both in tcpv4 and tcp6 tcp.c | 37 +++++++++++++++++++++++++------------ tcp_buf.c | 23 ++++++++++++++--------- tcp_buf.h | 2 +- 3 files changed, 40 insertions(+), 22 deletions(-) -- 2.45.2
From: Jon Maloy <jmaloy(a)redhat.com> Based on an original patch by Jon Maloy: -- The recently added socket option SO_PEEK_OFF is not supported for TCP/IPv6 sockets. Until we get that support into the kernel we need to test for support in both protocols to set the global 'peek_offset_capĀ“ to true. -- Compared to the original patch: - only check for SO_PEEK_OFF support for enabled IP versions - use sa_family_t instead of int to pass the address family around Fixes: e63d281871ef ("tcp: leverage support of SO_PEEK_OFF socket option when available") Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> Message-ID: <20240722220937.3663437-1-sbrivio(a)redhat.com> Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/tcp.c b/tcp.c index 0c66ac84..c031f13e 100644 --- a/tcp.c +++ b/tcp.c @@ -2470,6 +2470,29 @@ static void tcp_sock_refill_init(const struct ctx *c) } } +/** + * tcp_probe_peek_offset_cap() - Check if SO_PEEK_OFF is supported by kernel + * @af: Address family, IPv4 or IPv6 + * + * Return: true if supported, false otherwise + */ +bool tcp_probe_peek_offset_cap(sa_family_t af) +{ + bool ret = false; + int s, optv = 0; + + s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = true; + close(s); + } + + return ret; +} + /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context @@ -2478,9 +2501,6 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned int optv = 0; - int s; - ASSERT(!c->no_tcp); if (c->ifi4) @@ -2502,15 +2522,8 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } - /* Probe for SO_PEEK_OFF support */ - s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); - if (s < 0) { - warn_perror("Temporary TCP socket creation failed"); - } else { - if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) - peek_offset_cap = true; - close(s); - } + peek_offset_cap = (!c->ifi4 || tcp_probe_peek_offset_cap(AF_INET)) && + (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); return 0; -- 2.45.2
When using the new SO_PEEK_OFF feature on TCP sockets, we must adjust the SO_PEEK_OFF value whenever we move conn->seq_to_tap backwards. Although it was discussed during development, somewhere during the shuffles the case where we move the pointer backwards because we lost frames while sending them to the guest. This can happen, for example, if the socket buffer on the Unix socket to qemu overflows. Fixing this is slightly complicated because we need to pass a non-const context pointer to some places we previously didn't need it. While we're there also fix a small stylistic issue in the function comment for tcp_revert_seq() - it was using spaces instead of tabs. Fixes: e63d281871ef ("tcp: leverage support of SO_PEEK_OFF socket...") Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp_buf.c | 23 ++++++++++++++--------- tcp_buf.h | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tcp_buf.c b/tcp_buf.c index 9b198984..c31e9f31 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -235,12 +235,13 @@ void tcp_flags_flush(const struct ctx *c) /** * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission - * @conns: Array of connection pointers corresponding to queued frames - * @frames: Two-dimensional array containing queued frames with sub-iovs - * @num_frames: Number of entries in the two arrays to be compared + * @ctx: Execution context + * @conns: Array of connection pointers corresponding to queued frames + * @frames: Two-dimensional array containing queued frames with sub-iovs + * @num_frames: Number of entries in the two arrays to be compared */ -static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS], - int num_frames) +static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, + struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) { int i; @@ -248,11 +249,15 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base; struct tcp_tap_conn *conn = conns[i]; uint32_t seq = ntohl(th->seq); + uint32_t peek_offset; if (SEQ_LE(conn->seq_to_tap, seq)) continue; conn->seq_to_tap = seq; + peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; + if (tcp_set_peek_offset(conn->sock, peek_offset)) + tcp_rst(c, conn); } } @@ -260,14 +265,14 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ -void tcp_payload_flush(const struct ctx *c) +void tcp_payload_flush(struct ctx *c) { size_t m; m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, tcp6_payload_used); if (m != tcp6_payload_used) { - tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m], + tcp_revert_seq(c, &tcp6_frame_conns[m], &tcp6_l2_iov[m], tcp6_payload_used - m); } tcp6_payload_used = 0; @@ -275,7 +280,7 @@ void tcp_payload_flush(const struct ctx *c) m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, tcp4_payload_used); if (m != tcp4_payload_used) { - tcp_revert_seq(&tcp4_frame_conns[m], &tcp4_l2_iov[m], + tcp_revert_seq(c, &tcp4_frame_conns[m], &tcp4_l2_iov[m], tcp4_payload_used - m); } tcp4_payload_used = 0; @@ -353,7 +358,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @seq: Sequence number to be sent */ -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { struct iovec *iov; diff --git a/tcp_buf.h b/tcp_buf.h index 14be7b94..3db4c56e 100644 --- a/tcp_buf.h +++ b/tcp_buf.h @@ -9,7 +9,7 @@ void tcp_sock4_iov_init(const struct ctx *c); void tcp_sock6_iov_init(const struct ctx *c); void tcp_flags_flush(const struct ctx *c); -void tcp_payload_flush(const struct ctx *c); +void tcp_payload_flush(struct ctx *c); int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); -- 2.45.2
On Wed, 24 Jul 2024 13:31:07 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:We discovered some problems caused by the recent merge of SO_PEEK_OFF support. 1/2 is a fix from Jon, modified by Stefano. 2/2 is a fix for a different issue. David Gibson (1): tcp: Correctly update SO_PEEK_OFF when tcp_send_frames() drops frames Jon Maloy (1): tcp: probe for SO_PEEK_OFF both in tcpv4 and tcp6Applied. -- Stefano