The TCP code uses a number of different epoll event masks, and even
switches between edge-triggered and level-triggered mode, depending on
a number of factors.  This makes it very difficult to reason about
exactly which events will be active when.  Simplify this by instead
always using an event mask with all the events we ever care about
(EPOLLIN, EPOLLOUT and EPOLLRDHUP).  We have a number of situations
where we can't immediately clear the event condition, so we must
always use edge-triggered mode to avoid spinning on events.

We do need some changes to handle edge-triggered mode properly.
Specifically, there are two cases where we might not be able to
process all new data on an EPOLLIN, so we need to make sure we repoll
the socket for queued data later:

1) We can't forward all the data to the guest because the tap-side
   window is too small.  To address this, we repoll the socket for
   data when we receive an ack or window update from the guest side.

2) We run out of buffer space in the tap device itself.  We handle
   this by flagging connections in this state, then rechecking them
   at the end of each epoll cycle if the tap device is reporting
   ready for more data again.

tcp_defer_handler() is called after the last round of epoll events is
handled.  Add logic there to revisit any connections which were
previously blocked because we ran out of buffer space on the tap
device, and attempt to forward the data again.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tap.c |  1 -
 tcp.c | 58 ++++++++++++++++++++++++++--------------------------------
 2 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/tap.c b/tap.c
index 3bdca9a1..956db8e4 100644
--- a/tap.c
+++ b/tap.c
@@ -1149,7 +1149,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
  * Return: true if (last we knew) there's more space in the tap buffers, false
  *         otherwise
  */
-/* cppcheck-suppress unusedFunction */
 bool tap_is_full(void)
 {
 	return tap_full_flag;
diff --git a/tcp.c b/tcp.c
index 78f546db..7e11f12b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -423,34 +423,6 @@ int tcp_set_peek_offset(int s, int offset)
 	return 0;
 }
 
-/**
- * tcp_conn_epoll_events() - epoll events mask for given connection state
- * @events:	Current connection events
- * @conn_flags	Connection flags
- *
- * Return: epoll events mask corresponding to implied connection state
- */
-static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
-{
-	if (!events)
-		return 0;
-
-	if (events & ESTABLISHED) {
-		if (events & TAP_FIN_SENT)
-			return EPOLLET;
-
-		if (conn_flags & STALLED)
-			return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
-
-		return EPOLLIN | EPOLLRDHUP;
-	}
-
-	if (events == TAP_SYN_RCVD)
-		return EPOLLOUT | EPOLLET | EPOLLRDHUP;
-
-	return EPOLLET | EPOLLRDHUP;
-}
-
 /**
  * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
  * @c:		Execution context
@@ -463,7 +435,10 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 	int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
 				.flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
-	struct epoll_event ev = { .data.u64 = ref.u64 };
+	struct epoll_event ev = {
+		.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET,
+		.data.u64 = ref.u64,
+	};
 
 	if (conn->events == CLOSED) {
 		if (conn->in_epoll)
@@ -473,8 +448,6 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
 		return 0;
 	}
 
-	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
-
 	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
 		return -errno;
 
@@ -1746,9 +1719,13 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 			tcp_rst(c, conn);
 			return -1;
 		}
 
-		tcp_data_from_sock(c, conn);
 	}
 
+	/* The socket side might have queued data that we didn't send due to a
+	 * full window.  Now the window's updated, try again
+	 */
+	tcp_data_from_sock(c, conn);
+
 	if (!iov_i)
 		goto out;
@@ -2268,6 +2245,23 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
 /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */
 void tcp_defer_handler(struct ctx *c)
 {
+	unsigned i;
+
+	for (i = 0; i < FLOW_MAX; i++) {
+		union flow *flow = FLOW(i);
+
+		if (!num_tap_full)
+			break; /* No need to continue */
+		if (tap_is_full())
+			break; /* No point in continuing */
+
+		if ((flow->f.type != FLOW_TCP) ||
+		    !(flow->tcp.flags & TAP_FULL))
+			continue;
+
+		tcp_data_from_sock(c, &flow->tcp);
+	}
+
 	tcp_flags_flush(c);
 	tcp_payload_flush(c);
 }
-- 
2.46.0
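
For readers less familiar with edge-triggered epoll, the pattern this
change relies on can be sketched roughly as below.  This is only an
illustration, not passt code: watch(), drain(), sink_full and
need_retry are hypothetical stand-ins for tcp_epoll_ctl(),
tcp_data_from_sock(), tap_is_full() and the TAP_FULL flag.

#include <errno.h>
#include <stdbool.h>
#include <sys/epoll.h>
#include <unistd.h>

static bool need_retry;	/* revisit this fd at the end of the epoll cycle */

/* Register one fixed, edge-triggered mask: we are only notified of *new*
 * readiness, so the handler must consume until EAGAIN or flag a retry
 */
static int watch(int epollfd, int fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET,
		.data.fd = fd,
	};

	return epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev);
}

/* On EPOLLIN: read until EAGAIN, or set need_retry if the consumer is
 * full so a deferred handler (cf. tcp_defer_handler()) tries again
 */
static void drain(int fd, bool sink_full)
{
	char buf[4096];
	ssize_t n;

	while (!sink_full) {
		n = read(fd, buf, sizeof(buf));
		if (n < 0 && errno == EAGAIN)
			return;		/* fully drained: safe to sleep */
		if (n <= 0)
			return;		/* EOF or error: handled elsewhere */
		/* ...forward buf[0..n) to the other side here... */
	}
	need_retry = true;	/* data may still be queued on the socket */
}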