First commit introduces support for SO_PEEK_OFF socket option. Second commit adds a workaround for a kernel TCP bug. Jon Maloy (2): tcp: leverage support of SO_PEEK_OFF socket option when available tcp: allow retransmit when peer receive window is zero tcp.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------- tcp_conn.h | 2 ++ 2 files changed, 53 insertions(+), 12 deletions(-) -- 2.42.0
The kernel may support recvmsg(MSG_PEEK), starting reading data from a given offset set by the SO_PEEK_OFF socket option. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->name space direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 shows that throughput increases with 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- tcp.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 905d26f..95d400a 100644 --- a/tcp.c +++ b/tcp.c @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ +static bool peek_offset_cap = false; static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +static void set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + perror("Failed to set SO_PEEK_OFF\n"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) goto cancel; } - + set_peek_offset(s, 0); conn = &flow->tcp; conn->f.type = FLOW_TCP; conn->sock = s; @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */ + set_peek_offset(s, already_sent); + } + /* Receive into buffers, don't dequeue until acknowledged by guest. */ do len = recvmsg(s, &mh_sock, MSG_PEEK); @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s, (struct sockaddr *)&sa)) return; + set_peek_offset(s, 0); tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) int tcp_init(struct ctx *c) { unsigned b; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s < 0) { + perror("Temporary tcp socket creation failed\n"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) { + peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not "); return 0; } -- 2.42.0
On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:The kernel may support recvmsg(MSG_PEEK), starting reading data from a given offset set by the SO_PEEK_OFF socket option.It would be practical to mention in commit message and in a code comment which kernel commit introduced the feature, that is, your commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option") -- even if it's on net-next, better than nothing (net-next might be rebased but it's quite rare). Also the (forecast, at this point) kernel version where this will be introduced would be nice to have.This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->name space direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 shows that throughput increases with 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- tcp.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 905d26f..95d400a 100644 --- a/tcp.c +++ b/tcp.c @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ +static bool peek_offset_cap = false;No need to initialise, it's static. The comment just above referred to tcp_buf_discard and iov_sock ("data"), now you're adding a flag just under it, which is a bit confusing. I would rather leave the original comment for tcp_buf_discard and iov_sock, and add another one, say...:static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1];/* Does the kernel support TCP_PEEK_OFF? */ static bool peek_offset_cap;@@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +static void set_peek_offset(int s, int offset)A kerneldoc-style function comment would be nice, like all other functions in this file: /** * set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported * @s Socket * @offset Offset in bytes */+{ + if (!peek_offset_cap) + return; + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))gcc ('make') says: In function ‘set_peek_offset’, inlined from ‘tcp_listen_handler’ at tcp.c:2819:2: tcp.c:592:13: warning: ‘s’ may be used uninitialized [-Wmaybe-uninitialized] 592 | if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tcp.c: In function ‘tcp_listen_handler’: tcp.c:2815:13: note: ‘s’ was declared here 2815 | int s; | ^ clang-tidy ('make clang-tidy'): /home/sbrivio/passt/tcp.c:2819:2: error: 1st function call argument is an uninitialized value [clang-analyzer-core.CallAndMessage,-warnings-as-errors] set_peek_offset(s, 0); ^ ~ /home/sbrivio/passt/tcp.c:2815:2: note: 's' declared without an initial value int s; ^~~~~ /home/sbrivio/passt/tcp.c:2817:6: note: Assuming field 'no_tcp' is 0 if (c->no_tcp || !(flow = flow_alloc())) ^~~~~~~~~ /home/sbrivio/passt/tcp.c:2817:6: note: Left side of '||' is false /home/sbrivio/passt/tcp.c:2817:21: note: Assuming 'flow' is non-null if (c->no_tcp || !(flow = flow_alloc())) ^~~~ /home/sbrivio/passt/tcp.c:2817:2: note: Taking false branch if (c->no_tcp || !(flow = flow_alloc())) ^ /home/sbrivio/passt/tcp.c:2819:2: note: 1st function call argument is an uninitialized value set_peek_offset(s, 0); ^ ~ /home/sbrivio/passt/tcp.c:2819:18: error: variable 's' is uninitialized when used here [clang-diagnostic-uninitialized,-warnings-as-errors] set_peek_offset(s, 0); ^ /home/sbrivio/passt/tcp.c:2815:7: note: initialize the variable 's' to silence this warning int s; ^ = 0 and cppcheck ('make cppcheck'): tcp.c:2819:18: error: Uninitialized variable: s [uninitvar] set_peek_offset(s, 0); ^ tcp.c:2817:16: note: Assuming condition is false if (c->no_tcp || !(flow = flow_alloc())) ^ tcp.c:2819:18: note: Uninitialized variable: s set_peek_offset(s, 0); ^ make: *** [Makefile:296: cppcheck] Error 1+ perror("Failed to set SO_PEEK_OFF\n"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) goto cancel; } -Spurious change.+ set_peek_offset(s, 0);Do we really need to initialise it to zero on a new connection? Extra system calls on this path matter for latency of connection establishment.conn = &flow->tcp; conn->f.type = FLOW_TCP; conn->sock = s; @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */While Wiktionary lists "synch" as "Alternative spelling of sync", I would argue that "sync" is much more common and less surprising.+ set_peek_offset(s, already_sent);Note that there's an early return before this point where already_sent is set to 0. Don't you need to set_peek_offset() also in that case? I guess this would be easier to follow if both assignments of already_sent, earlier in this function, were followed by a set_peek_offset() call. Or do we risk calling it spuriously? > + } > + > /* Receive into buffers, don't dequeue until acknowledged by guest. */ > do > len = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - sendlen = len - already_sent; > + sendlen = len; > + if (!peek_offset_cap) > + sendlen -= already_sent; > if (sendlen <= 0) { > conn_flag(c, conn, STALLED); > return 0; > @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, > s, (struct sockaddr *)&sa)) > return;+ set_peek_offset(s, 0);Same here, do we really need to initialise it to zero? If yes, it would be nice to leave a blank line after that 'return' as it was before.tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) int tcp_init(struct ctx *c) { unsigned b; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);By default, we should always pass SOCK_CLOEXEC to socket(), just in case we miss closing one.+ if (s < 0) { + perror("Temporary tcp socket creation failed\n");This should be a warn() call, and s/tcp/TCP/.+ } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {No particular reason to exceed 80 columns here, and curly brackets aren't needed (coding style is the same as kernel). For consistency with the rest of the codebase: &((int) { 0 }).+ peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");This should be a debug() call, a bit too much for info().return 0; }...I'm still reviewing 2/2, sorry for the delay. -- Stefano
On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:[snip]Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1. We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEK or b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets). -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ set_peek_offset(s, 0);Do we really need to initialise it to zero on a new connection? Extra system calls on this path matter for latency of connection establishment.
On Wed, 24 Apr 2024 10:48:05 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:By the way of which, this is not documented at this point -- a man page patch (linux-man and linux-api lists) would be nice.On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:[snip]Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1.+ set_peek_offset(s, 0);Do we really need to initialise it to zero on a new connection? Extra system calls on this path matter for latency of connection establishment.We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEKI was actually thinking that we could set it only as we receive data (not every connection will receive data), and keep this out of the handshake (which we want to keep "faster", I think). And setting it as we mark a connection as ESTABLISHED should have the same effect on latency as setting it on a new connection -- that's not really lazy. So, actually:or b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets)....this could be done as we receive data in tcp_data_from_sock(), with a new flag in tcp_tap_conn::flags, to avoid adding latency to the handshake. It also looks more robust to me, and done/checked in a single place where we need it. We have just three bits left there which isn't great, but if we need to save one at a later point, we can drop this new flag easily. -- Stefano
On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote:On Wed, 24 Apr 2024 10:48:05 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:That makes sense, but I think it would need a per-connection flag.On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:By the way of which, this is not documented at this point -- a man page patch (linux-man and linux-api lists) would be nice.On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:[snip]Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1.+ set_peek_offset(s, 0);Do we really need to initialise it to zero on a new connection? Extra system calls on this path matter for latency of connection establishment.We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEKI was actually thinking that we could set it only as we receive data (not every connection will receive data), and keep this out of the handshake (which we want to keep "faster", I think).And setting it as we mark a connection as ESTABLISHED should have the same effect on latency as setting it on a new connection -- that's not really lazy. So, actually:Good point.I just realised that folding the feature detection into this is a bit costlier than I thought. If we globally probe the feature we just need one bit per connection: is SO_PEEK_OFF set yet or not. If we tried to probe per-connection we'd need a tristate: haven't tried / SO_PEEK_OFF enabled / tried and failed. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonor b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets)....this could be done as we receive data in tcp_data_from_sock(), with a new flag in tcp_tap_conn::flags, to avoid adding latency to the handshake. It also looks more robust to me, and done/checked in a single place where we need it. We have just three bits left there which isn't great, but if we need to save one at a later point, we can drop this new flag easily.
On Fri, 26 Apr 2024 13:27:11 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote:Definitely.On Wed, 24 Apr 2024 10:48:05 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:That makes sense, but I think it would need a per-connection flag.On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:By the way of which, this is not documented at this point -- a man page patch (linux-man and linux-api lists) would be nice.On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:[snip]> + set_peek_offset(s, 0); Do we really need to initialise it to zero on a new connection? Extra system calls on this path matter for latency of connection establishment.Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1.We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEKI was actually thinking that we could set it only as we receive data (not every connection will receive data), and keep this out of the handshake (which we want to keep "faster", I think).I forgot to mention this part: what I wanted to propose was actually still a global probe, so that we don't waste one system call per connection on kernels not supporting this (a substantial use case for a couple of years from now?), which probably outweighs the advantage of the weird, purely theoretical kernel not supporting the feature for some sockets only. And then something like PEEK_OFFSET_SET (SO_PEEK_OFF_SET sounds awkward to me) on top. Another advantage is avoiding the tristate you described. -- StefanoAnd setting it as we mark a connection as ESTABLISHED should have the same effect on latency as setting it on a new connection -- that's not really lazy. So, actually:Good point.I just realised that folding the feature detection into this is a bit costlier than I thought. If we globally probe the feature we just need one bit per connection: is SO_PEEK_OFF set yet or not. If we tried to probe per-connection we'd need a tristate: haven't tried / SO_PEEK_OFF enabled / tried and failed.or b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets)....this could be done as we receive data in tcp_data_from_sock(), with a new flag in tcp_tap_conn::flags, to avoid adding latency to the handshake. It also looks more robust to me, and done/checked in a single place where we need it. We have just three bits left there which isn't great, but if we need to save one at a later point, we can drop this new flag easily.
On Fri, Apr 26, 2024 at 07:58:32AM +0200, Stefano Brivio wrote:On Fri, 26 Apr 2024 13:27:11 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote:Definitely.On Wed, 24 Apr 2024 10:48:05 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:That makes sense, but I think it would need a per-connection flag.On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: > On Sat, 20 Apr 2024 15:19:19 -0400 > Jon Maloy <jmaloy(a)redhat.com> wrote: [snip] > > + set_peek_offset(s, 0); > > Do we really need to initialise it to zero on a new connection? Extra > system calls on this path matter for latency of connection > establishment. Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1.By the way of which, this is not documented at this point -- a man page patch (linux-man and linux-api lists) would be nice.We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEKI was actually thinking that we could set it only as we receive data (not every connection will receive data), and keep this out of the handshake (which we want to keep "faster", I think).I forgot to mention this part: what I wanted to propose was actually still a global probe, so that we don't waste one system call per connection on kernels not supporting this (a substantial use case for a couple of years from now?), which probably outweighs the advantage of the weird, purely theoretical kernel not supporting the feature for some sockets only.And setting it as we mark a connection as ESTABLISHED should have the same effect on latency as setting it on a new connection -- that's not really lazy. So, actually:Good point.I just realised that folding the feature detection into this is a bit costlier than I thought. If we globally probe the feature we just need one bit per connection: is SO_PEEK_OFF set yet or not. If we tried to probe per-connection we'd need a tristate: haven't tried / SO_PEEK_OFF enabled / tried and failed.or b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets)....this could be done as we receive data in tcp_data_from_sock(), with a new flag in tcp_tap_conn::flags, to avoid adding latency to the handshake. It also looks more robust to me, and done/checked in a single place where we need it. We have just three bits left there which isn't great, but if we need to save one at a later point, we can drop this new flag easily.And then something like PEEK_OFFSET_SET (SO_PEEK_OFF_SET sounds awkward to me) on top. Another advantage is avoiding the tristate you described.Right, having thought it through I agree this is a better approach. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On 2024-04-23 20:48, David Gibson wrote:On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote:Yes. Although I moved it to tcp_conn_from_tap() in the next version, to make it more symmetric with the one in tcp_tap_conn_from_sock()On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy(a)redhat.com> wrote:[snip] >> + set_peek_offset(s, 0); > Do we really need to initialise it to zero on a new connection? Extra > system calls on this path matter for latency of connection > establishment.Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1. We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEK or b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets).
On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote:The kernel may support recvmsg(MSG_PEEK), starting reading data from aNot worth a respin on its own, but I think the comma above is misplaced, and for me makes the sentence much harder to read.given offset set by the SO_PEEK_OFF socket option. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->name space direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 shows that throughput increases with 15-20 percent in the host->namespace direction when this feature is used. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- tcp.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 905d26f..95d400a 100644 --- a/tcp.c +++ b/tcp.c @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ +static bool peek_offset_cap = false; static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +static void set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + perror("Failed to set SO_PEEK_OFF\n"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) goto cancel; } - + set_peek_offset(s, 0); conn = &flow->tcp; conn->f.type = FLOW_TCP; conn->sock = s; @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */ + set_peek_offset(s, already_sent);I thought we didn't need to set SO_PEEK_OFF here - that it would track on its own, and we only needed to change it for retransmits. I don't think we even need to calculate 'already_sent' when we have SO_PEEK_OFF. In fact - if we set already_sent to 0 here, it might make things a bit cleaner than having to have special cases for adjusting the iov and sendlen.+ } + /* Receive into buffers, don't dequeue until acknowledged by guest. */ do len = recvmsg(s, &mh_sock, MSG_PEEK); @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s, (struct sockaddr *)&sa)) return; + set_peek_offset(s, 0); tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) int tcp_init(struct ctx *c) { unsigned b; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s < 0) { + perror("Temporary tcp socket creation failed\n"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) { + peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");Should be an info().return 0; }-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On 2024-04-23 20:44, David Gibson wrote:On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote:[...]The kernel may support recvmsg(MSG_PEEK), starting reading data from aNot worth a respin on its own, but I think the comma above is misplaced, and for me makes the sentence much harder to read. > given offset set by the SO_PEEK_OFF socket option. This makes it > possible to avoid repeated reading of already read initial bytes of a > received message, hence saving read cycles when forwarding TCP messages > in the host->name space direction. > > In this commit, we add functionality to leverage this feature when available,In theory yes. I tried it for a while, using SEQ_GE(max_ack_seq, ack_seq) as criteria for retransmission. I observed some strange behavior, like retransmits that seemingly did not come from fast retransmit or timer retransmit, and that the kernel 'sk_peek_off' didn´t always have the expected value when comparing with 'already_sent´. Since my focus was on the zero-window issue I decided to skip this for now and take the safe option. I may revisit this later.@@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */ + set_peek_offset(s, already_sent);I thought we didn't need to set SO_PEEK_OFF here - that it would track on its own, and we only needed to change it for retransmits. I don't think we even need to calculate 'already_sent' when we have SO_PEEK_OFF. In fact - if we set already_sent to 0 here, it might make things a bit cleaner than having to have special cases for adjusting the iov and sendlen.> + } > + > /* Receive into buffers, don't dequeue until acknowledged by guest. */ > do > len = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } >[...]Made it a debug() as suggested by Stefano.+ peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ")Should be an info().> return 0; > } >
On Thu, Apr 25, 2024 at 07:23:28PM -0400, Jon Maloy wrote:On 2024-04-23 20:44, David Gibson wrote:Ouch, that sounds bad. I'm pretty sure that means there's a bug on one side or the other.On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote:[...]The kernel may support recvmsg(MSG_PEEK), starting reading data from aNot worth a respin on its own, but I think the comma above is misplaced, and for me makes the sentence much harder to read. > given offset set by the SO_PEEK_OFF socket option. This makes it > possible to avoid repeated reading of already read initial bytes of a > received message, hence saving read cycles when forwarding TCP messages > in the host->name space direction. > > In this commit, we add functionality to leverage this feature when available,In theory yes. I tried it for a while, using SEQ_GE(max_ack_seq, ack_seq) as criteria for retransmission. I observed some strange behavior, like retransmits that seemingly did not come from fast retransmit or timer retransmit, and that the kernel 'sk_peek_off' didn´t always have the expected value when comparing with 'already_sent´.@@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */ + set_peek_offset(s, already_sent);I thought we didn't need to set SO_PEEK_OFF here - that it would track on its own, and we only needed to change it for retransmits. I don't think we even need to calculate 'already_sent' when we have SO_PEEK_OFF. In fact - if we set already_sent to 0 here, it might make things a bit cleaner than having to have special cases for adjusting the iov and sendlen.Since my focus was on the zero-window issue I decided to skip this for now and take the safe option. I may revisit this later.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson> + } > + > /* Receive into buffers, don't dequeue until acknowledged by guest. */ > do > len = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > }[...]Made it a debug() as suggested by Stefano.+ peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ")Should be an info().
A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while buffer reads doesn't lead to it being updated. At the same time, the zero window stops this side from sending out more data to trigger new advertisements to be sent from the peer. RFC 793 states that it always is permitted for a sender to send one byte of data even when the window is zero. This resolves the deadlock described above, so we choose to introduce it here as a last resort. We allow it both during fast and as keep-alives when the timer sees no activity on the connection. However, we notice that this solution doesn´t work well. Traffic sometimes goes to zero, and onley recovers after the timer has resolved the situation. Because of this, we chose to improve it slightly: The deadlock happens when a packet has been dropped at the peer end because of memory squeeze. We therefore consider it legitimate to retransmit that packet while considering the window size that was valid at the moment it was first transmitted. This works much better. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this probing. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- tcp.c | 26 ++++++++++++++++---------- tcp_conn.h | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 95d400a..9dea151 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->max_seq_to_tap = conn->seq_to_tap; } /** @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; sendlen = len; if (!peek_offset_cap) @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) tcp_data_to_tap(c, conn, plen, no_csum, seq); seq += plen; } - + /* We need this to know this during retransmission: */ + if (SEQ_GT(seq, conn->max_seq_to_tap)) + conn->max_seq_to_tap = seq; conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ retr = !len && !th->fin && - ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + ack_seq == max_ack_seq; max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); + conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); } if (!iov_i) @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); tcp_send_flag(c, conn, ACK); } @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, tcp_tap_window_update(conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (p->count - idx == 1) return 1; @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); tcp_timer_ctl(c, conn); } } else { @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_rst(c, conn); } } + } /** @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) conn_event(c, conn, SOCK_FIN_RCVD); if (events & EPOLLIN) - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); diff --git a/tcp_conn.h b/tcp_conn.h index a5f5cfe..afcdec9 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -29,6 +29,7 @@ * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap + * @max_seq_to_tap: Next seq after highest ever sent. Needeed during retransmit * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap @@ -100,6 +101,7 @@ struct tcp_tap_conn { uint16_t wnd_to_tap; uint32_t seq_to_tap; + uint32_t max_seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; -- 2.42.0
On Sat, Apr 20, 2024 at 03:19:20PM -0400, Jon Maloy wrote:A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while buffer reads doesn't lead to it being updated. At the same time, the zero window stops this side from sending out more data to trigger new advertisements to be sent from the peer.This is a bit confusing to me. I guess the buffer reads must be happening on the peer, not "this side", but that's not very clear. I think "received from the peer" might be better than "sent from the peer" to make it more obvious that the point of view is uniformly from "this side".RFC 793 states that it always is permitted for a sender to send one byte of data even when the window is zero. This resolves the deadlock described above, so we choose to introduce it here as a last resort. We allow it both during fast and as keep-alives when the timer sees no activity on the connection.Looks like there's a missing noun after "during fast".However, we notice that this solution doesn´t work well. Traffic sometimes goes to zero, and onley recovers after the timer has resolved the situation.s/onley/only/Because of this, we chose to improve it slightly: The deadlock happens when aIs it actually useful to describe the "bad" workaround if we're using a different one? I don't see how the better workaround is an obvious extension of the bad one.packet has been dropped at the peer end because of memory squeeze. We therefore consider it legitimate to retransmit that packet while considering the window size that was valid at the moment it was first transmitted. This works much better. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this probing. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- tcp.c | 26 ++++++++++++++++---------- tcp_conn.h | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 95d400a..9dea151 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->max_seq_to_tap = conn->seq_to_tap; } /** @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; sendlen = len; if (!peek_offset_cap)Looks like some duplicated lines from a bad rebase.@@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) tcp_data_to_tap(c, conn, plen, no_csum, seq); seq += plen; } - + /* We need this to know this during retransmission: */ + if (SEQ_GT(seq, conn->max_seq_to_tap)) + conn->max_seq_to_tap = seq; conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ retr = !len && !th->fin && - ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + ack_seq == max_ack_seq; max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); + conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));Does the MAX do anything: if max_seq_to_tap equals seq_ack_from_tap then, by definition, there is nothing to retransmit - everything has been acked.} if (!iov_i) @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); tcp_send_flag(c, conn, ACK); } @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, tcp_tap_window_update(conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (p->count - idx == 1) return 1; @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); tcp_timer_ctl(c, conn); } } else { @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_rst(c, conn); } } +Spurious change} /** @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) conn_event(c, conn, SOCK_FIN_RCVD); if (events & EPOLLIN) - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); diff --git a/tcp_conn.h b/tcp_conn.h index a5f5cfe..afcdec9 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -29,6 +29,7 @@ * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap + * @max_seq_to_tap: Next seq after highest ever sent. Needeed during retransmit * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap @@ -100,6 +101,7 @@ struct tcp_tap_conn { uint16_t wnd_to_tap; uint32_t seq_to_tap; + uint32_t max_seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap;-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On top of David's comments: On Wed, 24 Apr 2024 11:04:51 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Sat, Apr 20, 2024 at 03:19:20PM -0400, Jon Maloy wrote:As incredible as it might sound, RFC 793 is now obsolete :) -- you should refer to RFC 9293 instead. Further, RFC 9293 3.8.6.1 (just like RFC 793) says that we *must* (albeit not MUST) "regularly" send that octet -- as long as we have one available. Not that it's permitted. Hence, a general comment on this patch: my understanding is that we want to implement zero-window probing, instead of triggering a fast re-transmit whenever we get a keep-alive packet from the peer, because we don't know if we'll get one. RFC 9293 3.8.6.1 goes on saying: The transmitting host SHOULD send the first zero-window probe when a zero window has existed for the retransmission timeout period (SHLD-29) (Section 3.8.1), and SHOULD increase exponentially the interval between successive probes (SHLD-30). but we currently don't compute the retransmission timeout according to RFC 6298 (yeah, I know...). Given that it's a SHOULD, and complying with that is clearly beyond the scope of this change, we can just use one of the existing timeouts and timers (ACK_TIMEOUT would do, for the moment).A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while buffer reads doesn't lead to it being updated. At the same time, the zero window stops this side from sending out more data to trigger new advertisements to be sent from the peer.This is a bit confusing to me. I guess the buffer reads must be happening on the peer, not "this side", but that's not very clear. I think "received from the peer" might be better than "sent from the peer" to make it more obvious that the point of view is uniformly from "this side". > RFC 793 states that it always is permitted for a sender to send one byte > of data even when the window is zero.This exceeds 80 columns for no particularly good reason, and the function comment should be updated.This resolves the deadlock described above, so we choose to introduce it here as a last resort. We allow it both during fast and as keep-alives when the timer sees no activity on the connection.Looks like there's a missing noun after "during fast".However, we notice that this solution doesn´t work well. Traffic sometimes goes to zero, and onley recovers after the timer has resolved the situation.s/onley/only/Because of this, we chose to improve it slightly: The deadlock happens when aIs it actually useful to describe the "bad" workaround if we're using a different one? I don't see how the better workaround is an obvious extension of the bad one. > packet has been dropped at the peer end because of memory squeeze. We therefore > consider it legitimate to retransmit that packet while considering the window > size that was valid at the moment it was first transmitted. This works > much better. > > It should be noted that although this solves the problem we have at hand, > it is not a genuine solution to the kernel bug. There may well be TCP stacks > around in other OS-es which don't do this probing. > > Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> > --- > tcp.c | 26 ++++++++++++++++---------- > tcp_conn.h | 2 ++ > 2 files changed, 18 insertions(+), 10 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 95d400a..9dea151 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, > ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; > > conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; > + conn->max_seq_to_tap = conn->seq_to_tap; > } > > /** > @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > * > * #syscalls recvmsg > */ > -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled)s/this to know this/to know this/ (I guess){ - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; sendlen = len; if (!peek_offset_cap)Looks like some duplicated lines from a bad rebase. > @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > tcp_data_to_tap(c, conn, plen, no_csum, seq); > seq += plen; > } > - > + /* We need this to know this during retransmission: */> + if (SEQ_GT(seq, conn->max_seq_to_tap)) > + conn->max_seq_to_tap = seq; > conn_flag(c, conn, ACK_FROM_TAP_DUE); > > return 0; > @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > SEQ_GE(ack_seq, max_ack_seq)) { > /* Fast re-transmit */ > retr = !len && !th->fin && > - ack_seq == max_ack_seq && > - ntohs(th->window) == max_ack_seq_wnd; > + ack_seq == max_ack_seq;This matches any keep-alive (that is, a segment without data and without a window update) we get: I think you should replace the existing condition with a check that the window is zero, instead.> > max_ack_seq_wnd = ntohs(th->window); > max_ack_seq = ack_seq; > @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > flow_trace(conn, > "fast re-transmit, ACK: %u, previous sequence: %u", > max_ack_seq, conn->seq_to_tap); > +Spurious change.Sequence numbers regularly wrap around (they are 32 bits), so you can't really define this. I'm not entirely sure how you use it, though -- I have the same question about the usage of MAX() above.conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap));Does the MAX do anything: if max_seq_to_tap equals seq_ack_from_tap then, by definition, there is nothing to retransmit - everything has been acked.} if (!iov_i) @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); tcp_send_flag(c, conn, ACK); } @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, tcp_tap_window_update(conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (p->count - idx == 1) return 1; @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); tcp_timer_ctl(c, conn); } } else { @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_rst(c, conn); } } +Spurious change > } > > /** > @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) > conn_event(c, conn, SOCK_FIN_RCVD); > > if (events & EPOLLIN) > - tcp_data_from_sock(c, conn); > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > > if (events & EPOLLOUT) > tcp_update_seqack_wnd(c, conn, 0, NULL); > diff --git a/tcp_conn.h b/tcp_conn.h > index a5f5cfe..afcdec9 100644 > --- a/tcp_conn.h > +++ b/tcp_conn.h > @@ -29,6 +29,7 @@ > * @wnd_from_tap: Last window size from tap, unscaled (as received) > * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) > * @seq_to_tap: Next sequence for packets to tap > + * @max_seq_to_tap: Next seq after highest ever sent. Needeed during retransmit-- Stefano* @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap @@ -100,6 +101,7 @@ struct tcp_tap_conn { uint16_t wnd_to_tap; uint32_t seq_to_tap; + uint32_t max_seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap;