In the course of investigating bug 68, I discovered a number of pretty serious bugs in how we handle various cases in tcp_tap_handler() and tcp_data_from_tap(). This series fixes a number of them. Note that while I'm pretty sure the bugs fixed here are real, I haven't yet positively traced how they lead to the symptoms in bug 68 - I'm still waiting on the results from some special instrumentation to track that down. Link: https://bugs.passt.top/show_bug.cgi?id=68 David Gibson (8): tcp, tap: Correctly advance through packets in tcp_tap_handler() udp, tap: Correctly advance through packets in udp_tap_handler() tcp: Remove some redundant packet_get() operations tcp: Never hash match closed connections tcp: Return consumed packet count from tcp_data_from_tap() tcp: Correctly handle RST followed rapidly by SYN tcp: Consolidate paths where we initiate reset on tap interface tcp: Correct handling of FIN,ACK followed by SYN tap.c | 29 ++++++++++-------- tcp.c | 98 +++++++++++++++++++++++++++++++---------------------------- tcp.h | 2 +- udp.c | 15 ++++----- udp.h | 2 +- 5 files changed, 78 insertions(+), 68 deletions(-) -- 2.41.0
In both tap4_handler() and tap6_handler(), once we've sorted incoming l3 packets into "sequences", we then step through all the packets in each TCP sequence calling tcp_tap_handler(). Or so it appears. In fact, tcp_tap_handler() doesn't take an index and always looks at packet 0 of the sequence, except when it calls tcp_data_from_tap() to process data packets. It appears to be written with the idea that the struct pool is a queue, from which it consumes packets as it processes them, but that's not how the pool data structure works - they are more like an array of packets. We only get away with this, because setup packets for TCP tend to come in separate batches (because we need to reply in between) and so we only get a bunch of packets for the same connection together when they're data packets (tcp_data_from_tap() has its own loop through packets). Correct this by adding an index parameter to tcp_tap_handler() and altering the loops in tap.c to step through the pool properly. Link: https://bugs.passt.top/show_bug.cgi?id=68 Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 25 +++++++++++++++++-------- tcp.c | 28 +++++++++++++++------------- tcp.h | 2 +- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/tap.c b/tap.c index 8d7859c..445a5ca 100644 --- a/tap.c +++ b/tap.c @@ -707,16 +707,20 @@ append: for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { struct pool *p = (struct pool *)&seq->p; - size_t n = p->count; - tap_packet_debug(NULL, NULL, seq, 0, NULL, n); + tap_packet_debug(NULL, NULL, seq, 0, NULL, p->count); if (seq->protocol == IPPROTO_TCP) { + size_t k; + if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET, &seq->saddr, - &seq->daddr, p, now))); + for (k = 0; k < p->count; ) + k += tcp_tap_handler(c, AF_INET, &seq->saddr, + &seq->daddr, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { + size_t n = p->count; + if (c->no_udp) continue; while ((n -= udp_tap_handler(c, AF_INET, &seq->saddr, @@ -868,16 +872,21 @@ append: for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { struct pool *p = (struct pool *)&seq->p; - size_t n = p->count; - tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n); + tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, + p->count); if (seq->protocol == IPPROTO_TCP) { + size_t k; + if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET6, &seq->saddr, - &seq->daddr, p, now))); + for (k = 0; k < p->count; ) + k += tcp_tap_handler(c, AF_INET6, &seq->saddr, + &seq->daddr, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { + size_t n = p->count; + if (c->no_udp) continue; while ((n -= udp_tap_handler(c, AF_INET6, &seq->saddr, diff --git a/tcp.c b/tcp.c index c89e6e4..d8c2327 100644 --- a/tcp.c +++ b/tcp.c @@ -2294,11 +2294,12 @@ err: * @c: Execution context * @conn: Connection pointer * @p: Pool of TCP packets, with TCP headers + * @idx: Index of first data packet in pool * * #syscalls sendmsg */ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, - const struct pool *p) + const struct pool *p, int idx) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; @@ -2313,7 +2314,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, ASSERT(conn->events & ESTABLISHED); - for (i = 0, iov_i = 0; i < (int)p->count; i++) { + for (i = idx, iov_i = 0; i < (int)p->count; i++) { uint32_t seq, seq_offset, ack_seq; struct tcphdr *th; char *data; @@ -2530,12 +2531,13 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * @saddr: Source address * @daddr: Destination address * @p: Pool of TCP packets, with TCP headers + * @idx: Index of first packet in pool to process * @now: Current timestamp * * Return: count of consumed packets */ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, - const struct pool *p, const struct timespec *now) + const struct pool *p, int idx, const struct timespec *now) { struct tcp_tap_conn *conn; size_t optlen, len; @@ -2543,17 +2545,17 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, int ack_due = 0; char *opts; - if (!packet_get(p, 0, 0, 0, &len)) + if (!packet_get(p, idx, 0, 0, &len)) return 1; - th = packet_get(p, 0, 0, sizeof(*th), NULL); + th = packet_get(p, idx, 0, sizeof(*th), NULL); if (!th) return 1; optlen = th->doff * 4UL - sizeof(*th); /* Static checkers might fail to see this: */ optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); - opts = packet_get(p, 0, sizeof(*th), optlen, NULL); + opts = packet_get(p, idx, sizeof(*th), optlen, NULL); conn = tcp_hash_lookup(c, af, daddr, htons(th->source), htons(th->dest)); @@ -2569,7 +2571,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, if (th->rst) { conn_event(c, conn, CLOSED); - return p->count; + return p->count - idx; } if (th->ack && !(conn->events & ESTABLISHED)) @@ -2591,7 +2593,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, if (conn->events & TAP_SYN_RCVD) { if (!(conn->events & TAP_SYN_ACK_SENT)) { tcp_rst(c, conn); - return p->count; + return p->count - idx; } conn_event(c, conn, ESTABLISHED); @@ -2603,19 +2605,19 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); - return p->count; + return p->count - idx; } if (!th->ack) { tcp_rst(c, conn); - return p->count; + return p->count - idx; } tcp_clamp_window(c, conn, ntohs(th->window)); tcp_data_from_sock(c, conn); - if (p->count == 1) + if (p->count - idx == 1) return 1; } @@ -2631,7 +2633,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, } /* Established connections accepting data from tap */ - tcp_data_from_tap(c, conn, p); + tcp_data_from_tap(c, conn, p, idx); if (conn->seq_ack_to_tap != conn->seq_from_tap) ack_due = 1; @@ -2645,7 +2647,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, if (ack_due) conn_flag(c, conn, ACK_TO_TAP_DUE); - return p->count; + return p->count - idx; } /** diff --git a/tcp.h b/tcp.h index 9eaec3f..6444d6a 100644 --- a/tcp.h +++ b/tcp.h @@ -18,7 +18,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, const struct timespec *now); void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, - const struct pool *p, const struct timespec *now); + const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); -- 2.41.0
In both tap4_handler() and tap6_handler(), once we've sorted incoming l3 packets into "sequences", we then step through all the packets in each DUP sequence calling udp_tap_handler(). Or so it appears. In fact, udp_tap_handler() doesn't take an index and always starts with packet 0 of the sequence, even if called repeatedly. It appears to be written with the idea that the struct pool is a queue, from which it consumes packets as it processes them, but that's not how the pool data structure works. Correct this by adding an index parameter to udp_tap_handler() and altering the loops in tap.c to step through the pool properly. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 20 ++++++++------------ udp.c | 15 ++++++++------- udp.h | 2 +- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/tap.c b/tap.c index 445a5ca..93db989 100644 --- a/tap.c +++ b/tap.c @@ -707,24 +707,22 @@ append: for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { struct pool *p = (struct pool *)&seq->p; + size_t k; tap_packet_debug(NULL, NULL, seq, 0, NULL, p->count); if (seq->protocol == IPPROTO_TCP) { - size_t k; - if (c->no_tcp) continue; for (k = 0; k < p->count; ) k += tcp_tap_handler(c, AF_INET, &seq->saddr, &seq->daddr, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { - size_t n = p->count; - if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET, &seq->saddr, - &seq->daddr, p, now))); + for (k = 0; k < p->count; ) + k += udp_tap_handler(c, AF_INET, &seq->saddr, + &seq->daddr, p, k, now); } } @@ -872,25 +870,23 @@ append: for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { struct pool *p = (struct pool *)&seq->p; + size_t k; tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, p->count); if (seq->protocol == IPPROTO_TCP) { - size_t k; - if (c->no_tcp) continue; for (k = 0; k < p->count; ) k += tcp_tap_handler(c, AF_INET6, &seq->saddr, &seq->daddr, p, k, now); } else if (seq->protocol == IPPROTO_UDP) { - size_t n = p->count; - if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET6, &seq->saddr, - &seq->daddr, p, now))); + for (k = 0; k < p->count; ) + k += udp_tap_handler(c, AF_INET6, &seq->saddr, + &seq->daddr, p, k, now); } } diff --git a/udp.c b/udp.c index f4ed660..ed1b7a5 100644 --- a/udp.c +++ b/udp.c @@ -789,6 +789,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * @saddr: Source address * @daddr: Destination address * @p: Pool of UDP packets, with UDP headers + * @idx: Index of first packet to process * @now: Current timestamp * * Return: count of consumed packets @@ -796,7 +797,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * #syscalls sendmmsg */ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, - const struct pool *p, const struct timespec *now) + const struct pool *p, int idx, const struct timespec *now) { struct mmsghdr mm[UIO_MAXIOV]; struct iovec m[UIO_MAXIOV]; @@ -811,7 +812,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, (void)c; (void)saddr; - uh = packet_get(p, 0, 0, sizeof(*uh), NULL); + uh = packet_get(p, idx, 0, sizeof(*uh), NULL); if (!uh) return 1; @@ -859,7 +860,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, s = sock_l4(c, AF_INET, IPPROTO_UDP, &bind_addr, bind_if, src, uref.u32); if (s < 0) - return p->count; + return p->count - idx; udp_tap_map[V4][src].sock = s; bitmap_set(udp_act[V4][UDP_ACT_TAP], src); @@ -909,7 +910,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, bind_if, src, uref.u32); if (s < 0) - return p->count; + return p->count - idx; udp_tap_map[V6][src].sock = s; bitmap_set(udp_act[V6][UDP_ACT_TAP], src); @@ -918,13 +919,13 @@ int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, udp_tap_map[V6][src].ts = now->tv_sec; } - for (i = 0; i < (int)p->count; i++) { + for (i = 0; i < (int)p->count - idx; i++) { struct udphdr *uh_send; size_t len; - uh_send = packet_get(p, i, 0, sizeof(*uh), &len); + uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len); if (!uh_send) - return p->count; + return p->count - idx; mm[i].msg_hdr.msg_name = sa; mm[i].msg_hdr.msg_namelen = sl; diff --git a/udp.h b/udp.h index f553de2..0ee0695 100644 --- a/udp.h +++ b/udp.h @@ -11,7 +11,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, - const struct pool *p, const struct timespec *now); + const struct pool *p, int idx, const struct timespec *now); int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); -- 2.41.0
Both tcp_data_from_tap() and tcp_tap_handler() call packet_get() to get the entire L4 packet length, then immediately call it again to check that the packet is long enough to include a TCP header. The features of packet_get() let us easily combine these together, we just need to adjust the length slightly, because we want the value to include the TCP header length. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index d8c2327..6a34f82 100644 --- a/tcp.c +++ b/tcp.c @@ -2320,16 +2320,12 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, char *data; size_t off; - if (!packet_get(p, i, 0, 0, &len)) { - tcp_rst(c, conn); - return; - } - - th = packet_get(p, i, 0, sizeof(*th), NULL); + th = packet_get(p, i, 0, sizeof(*th), &len); if (!th) { tcp_rst(c, conn); return; } + len += sizeof(*th); off = th->doff * 4UL; if (off < sizeof(*th) || off > len) { @@ -2545,12 +2541,10 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, int ack_due = 0; char *opts; - if (!packet_get(p, idx, 0, 0, &len)) - return 1; - - th = packet_get(p, idx, 0, sizeof(*th), NULL); + th = packet_get(p, idx, 0, sizeof(*th), &len); if (!th) return 1; + len += sizeof(*th); optlen = th->doff * 4UL - sizeof(*th); /* Static checkers might fail to see this: */ -- 2.41.0
From a practical point of view, when a TCP connection ends, whether by FIN or by RST, we set the CLOSED event, then some time later we remove the connection from the hash table and clean it up. However, from a protocol point of view, once it's closed, it's gone, and any new packets with matching addresses and ports are either forming a new connection, or are invalid packets to discard. Enforce these semantics in the TCP hash logic by never hash matching closed connections. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 6a34f82..5592998 100644 --- a/tcp.c +++ b/tcp.c @@ -1146,7 +1146,7 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, const union inany_addr *faddr, in_port_t eport, in_port_t fport) { - if (inany_equals(&conn->faddr, faddr) && + if (conn->events != CLOSED && inany_equals(&conn->faddr, faddr) && conn->eport == eport && conn->fport == fport) return 1; -- 2.41.0
Currently tcp_data_from_tap() is assumed to consume all packets remaining in the packet pool it is given. However there are some edge cases where that's not correct. In preparation for fixing those, change it to return a count of packets consumed and use that in its caller. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 5592998..34c27f0 100644 --- a/tcp.c +++ b/tcp.c @@ -2297,8 +2297,10 @@ err: * @idx: Index of first data packet in pool * * #syscalls sendmsg + * + * Return: count of consumed packets */ -static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, +static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, const struct pool *p, int idx) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; @@ -2310,7 +2312,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, ssize_t n; if (conn->events == CLOSED) - return; + return p->count - idx; ASSERT(conn->events & ESTABLISHED); @@ -2323,19 +2325,19 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, th = packet_get(p, i, 0, sizeof(*th), &len); if (!th) { tcp_rst(c, conn); - return; + return p->count - idx; } len += sizeof(*th); off = th->doff * 4UL; if (off < sizeof(*th) || off > len) { tcp_rst(c, conn); - return; + return p->count - idx; } if (th->rst) { conn_event(c, conn, CLOSED); - return; + return p->count - idx; } len -= off; @@ -2446,10 +2448,10 @@ eintr: if (errno == EAGAIN || errno == EWOULDBLOCK) { tcp_send_flag(c, conn, ACK_IF_NEEDED); - return; + return p->count - idx; } tcp_rst(c, conn); - return; + return p->count - idx; } if (n < (int)(seq_from_tap - conn->seq_from_tap)) { @@ -2470,7 +2472,7 @@ out: conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff; tcp_send_flag(c, conn, DUP_ACK); } - return; + return p->count - idx; } if (ack && conn->events & TAP_FIN_SENT && @@ -2484,6 +2486,8 @@ out: } else { tcp_send_flag(c, conn, ACK_IF_NEEDED); } + + return p->count - idx; } /** @@ -2540,6 +2544,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, struct tcphdr *th; int ack_due = 0; char *opts; + int count; th = packet_get(p, idx, 0, sizeof(*th), &len); if (!th) @@ -2627,7 +2632,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, } /* Established connections accepting data from tap */ - tcp_data_from_tap(c, conn, p, idx); + count = tcp_data_from_tap(c, conn, p, idx); if (conn->seq_ack_to_tap != conn->seq_from_tap) ack_due = 1; @@ -2641,7 +2646,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, if (ack_due) conn_flag(c, conn, ACK_TO_TAP_DUE); - return p->count - idx; + return count; } /** -- 2.41.0
Although it's unlikely in practice, the guest could theoretically reset one TCP connection then immediately start a new one with the same addressses and ports, such that we get an RST then a SYN in the same batch of received packets in tcp_tap_handler(). We don't correctly handle that unlikely case, because when we receive the RST, we discard any remaining packets in the batch so we'd never see the SYN. This could happen in either tcp_tap_handler() or tcp_data_from_tap(). Correct that by returning 1, so that the caller will continue calling tcp_tap_handler() on subsequent packets allowing us to process any subsequent SYN. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 34c27f0..a1b5a72 100644 --- a/tcp.c +++ b/tcp.c @@ -2337,7 +2337,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, if (th->rst) { conn_event(c, conn, CLOSED); - return p->count - idx; + return 1; } len -= off; @@ -2570,7 +2570,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, if (th->rst) { conn_event(c, conn, CLOSED); - return p->count - idx; + return 1; } if (th->ack && !(conn->events & ESTABLISHED)) -- 2.41.0
There are a number of conditions where we will issue a TCP RST in response to something unexpected we received from the tap interface. These occur in both tcp_data_from_tap() and tcp_tap_handler(). In tcp_tap_handler() use a 'goto out of line' technique to consolidate all these paths into one place. For the tcp_data_from_tap() cases use a negative return code and direct that to the same path in tcp_tap_handler(), its caller. In this case we want to discard all remaining packets in the batch we have received: even if they're otherwise good, they'll be invalidated when the guest receives the RST we're sending. This is subtly different from the case where we *receive* an RST, where we could in theory get a new SYN immediately afterwards. Clarify that with a common on the now common reset path. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/tcp.c b/tcp.c index a1b5a72..c76df73 100644 --- a/tcp.c +++ b/tcp.c @@ -2323,17 +2323,13 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, size_t off; th = packet_get(p, i, 0, sizeof(*th), &len); - if (!th) { - tcp_rst(c, conn); - return p->count - idx; - } + if (!th) + return -1; len += sizeof(*th); off = th->doff * 4UL; - if (off < sizeof(*th) || off > len) { - tcp_rst(c, conn); - return p->count - idx; - } + if (off < sizeof(*th) || off > len) + return -1; if (th->rst) { conn_event(c, conn, CLOSED); @@ -2449,9 +2445,9 @@ eintr: if (errno == EAGAIN || errno == EWOULDBLOCK) { tcp_send_flag(c, conn, ACK_IF_NEEDED); return p->count - idx; + } - tcp_rst(c, conn); - return p->count - idx; + return -1; } if (n < (int)(seq_from_tap - conn->seq_from_tap)) { @@ -2580,20 +2576,18 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, /* Establishing connection from socket */ if (conn->events & SOCK_ACCEPTED) { - if (th->syn && th->ack && !th->fin) + if (th->syn && th->ack && !th->fin) { tcp_conn_from_sock_finish(c, conn, th, opts, optlen); - else - tcp_rst(c, conn); + return 1; + } - return 1; + goto reset; } /* Establishing connection from tap */ if (conn->events & TAP_SYN_RCVD) { - if (!(conn->events & TAP_SYN_ACK_SENT)) { - tcp_rst(c, conn); - return p->count - idx; - } + if (!(conn->events & TAP_SYN_ACK_SENT)) + goto reset; conn_event(c, conn, ESTABLISHED); @@ -2607,10 +2601,8 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, return p->count - idx; } - if (!th->ack) { - tcp_rst(c, conn); - return p->count - idx; - } + if (!th->ack) + goto reset; tcp_clamp_window(c, conn, ntohs(th->window)); @@ -2633,6 +2625,9 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, /* Established connections accepting data from tap */ count = tcp_data_from_tap(c, conn, p, idx); + if (count == -1) + goto reset; + if (conn->seq_ack_to_tap != conn->seq_from_tap) ack_due = 1; @@ -2647,6 +2642,14 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, conn_flag(c, conn, ACK_TO_TAP_DUE); return count; + +reset: + /* Something's gone wrong, so reset the connection. We discard + * remaining packets in the batch, since they'd be invalidated when our + * RST is received, even if otherwise good. + */ + tcp_rst(c, conn); + return p->count - idx; } /** -- 2.41.0
When the guest tries to establish a connection, it could give up on it by sending a FIN,ACK instead of a plain ACK to our SYN,ACK. It could then make a new attempt to establish a connection with the same addresses and ports with a new SYN. Although it's unlikely, it could send the 2nd SYN very shortly after the FIN,ACK resulting in both being received in the same batch of packets from the tap interface. Currently, we don't handle that correctly, when we receive a FIN,ACK on a not fully established connection we discard the remaining packets in the batch, and so will never process the 2nd SYN. Correct this by returning 1 from tcp_tap_handler() in this case, so we'll just consume the FIN,ACK and continue to process the rest of the batch. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index c76df73..dd3142d 100644 --- a/tcp.c +++ b/tcp.c @@ -2598,7 +2598,7 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); - return p->count - idx; + return 1; } if (!th->ack) -- 2.41.0
On Fri, 8 Sep 2023 11:49:45 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:In the course of investigating bug 68, I discovered a number of pretty serious bugs in how we handle various cases in tcp_tap_handler() and tcp_data_from_tap(). This series fixes a number of them. Note that while I'm pretty sure the bugs fixed here are real, I haven't yet positively traced how they lead to the symptoms in bug 68 - I'm still waiting on the results from some special instrumentation to track that down. Link: https://bugs.passt.top/show_bug.cgi?id=68 David Gibson (8): tcp, tap: Correctly advance through packets in tcp_tap_handler() udp, tap: Correctly advance through packets in udp_tap_handler() tcp: Remove some redundant packet_get() operations tcp: Never hash match closed connections tcp: Return consumed packet count from tcp_data_from_tap() tcp: Correctly handle RST followed rapidly by SYN tcp: Consolidate paths where we initiate reset on tap interface tcp: Correct handling of FIN,ACK followed by SYNSeries applied, thanks. -- Stefano