A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while buffer reads don't lead to it being updated. At the same time, the zero window stops this side from sending out more data to trigger new advertisements to be sent from the peer. RFC 793 states that it is always permitted for a sender to send one byte of data even when the window is zero. This resolves the deadlock described above, so we choose to introduce it here as a last resort. We allow it both during fast retransmit and as a keep-alive when the timer sees no activity on the connection. However, we notice that this solution doesn't work well. Traffic sometimes goes to zero, and only recovers after the timer has resolved the situation. Because of this, we chose to improve it slightly: The deadlock happens when a packet has been dropped at the peer end because of a memory squeeze. We therefore consider it legitimate to retransmit that packet while considering the window size that was valid at the moment it was first transmitted. This works much better. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OSes which don't do this probing.
Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- tcp.c | 26 ++++++++++++++++---------- tcp_conn.h | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 95d400a..9dea151 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->max_seq_to_tap = conn->seq_to_tap; } /** @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; sendlen = len; if (!peek_offset_cap) @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) tcp_data_to_tap(c, conn, plen, no_csum, seq); seq += plen; } - + /* We need this to know this during retransmission: */ + if (SEQ_GT(seq, conn->max_seq_to_tap)) + conn->max_seq_to_tap = seq; conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ retr = !len && !th->fin && - ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + ack_seq == max_ack_seq; max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: 
%u, previous sequence: %u", max_ack_seq, conn->seq_to_tap); + conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); } if (!iov_i) @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); tcp_send_flag(c, conn, ACK); } @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, tcp_tap_window_update(conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (p->count - idx == 1) return 1; @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); tcp_timer_ctl(c, conn); } } else { @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_rst(c, conn); } } + } /** @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) conn_event(c, conn, SOCK_FIN_RCVD); if (events & EPOLLIN) - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); diff --git a/tcp_conn.h b/tcp_conn.h index a5f5cfe..afcdec9 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -29,6 +29,7 @@ * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap + * @max_seq_to_tap: Next seq after highest ever sent. 
Needeed during retransmit * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap @@ -100,6 +101,7 @@ struct tcp_tap_conn { uint16_t wnd_to_tap; uint32_t seq_to_tap; + uint32_t max_seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; -- 2.42.0