Since 4684f603446b ("tap: Don't use EPOLLET on Qemu sockets") we've only used level-triggered events for the tap device. Prior to that we used EPOLLET inconsistently, which was confusing (though not incorrect AFAICT). We want to add support for EPOLLOUT events on the tap connection, and without EPOLLET that would require toggling EPOLLOUT on and off, which is awkward. So, re-introduce EPOLLET, but now use it uniformly for all tap modes. The main change this requires is making sure on EPOLLIN we loop until there's no more data to process. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tap.c b/tap.c index 41af6a6d..c1db2960 100644 --- a/tap.c +++ b/tap.c @@ -985,8 +985,10 @@ static void tap_sock_reset(struct ctx *c) * tap_passt_input() - Handler for new data on the socket to qemu * @c: Execution context * @now: Current timestamp + * + * Return: true if there may be additional data to read, false otherwise */ -static void tap_passt_input(struct ctx *c, const struct timespec *now) +static bool tap_passt_input(struct ctx *c, const struct timespec *now) { static const char *partial_frame; static ssize_t partial_len = 0; @@ -1013,7 +1015,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) err_perror("Receive error on guest connection, reset"); tap_sock_reset(c); } - return; + return false; } p = pkt_buf; @@ -1025,7 +1027,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) { err("Bad frame size from guest, resetting connection"); tap_sock_reset(c); - return; + return false; } if (l2len + sizeof(uint32_t) > (size_t)n) @@ -1045,6 +1047,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) partial_frame = p; tap_handler(c, now); + + return true; } /** @@ -1061,16 +1065,20 @@ void tap_handler_passt(struct ctx *c, uint32_t events, 
return; } - if (events & EPOLLIN) - tap_passt_input(c, now); + if (events & EPOLLIN) { + while (tap_passt_input(c, now)) + ; + } } /** * tap_pasta_input() - Handler for new data on the socket to hypervisor * @c: Execution context * @now: Current timestamp + * + * Return: true if there may be additional data to read, false otherwise */ -static void tap_pasta_input(struct ctx *c, const struct timespec *now) +static bool tap_pasta_input(struct ctx *c, const struct timespec *now) { ssize_t n, len; @@ -1102,6 +1110,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) } tap_handler(c, now); + + return len > 0; } /** @@ -1116,8 +1126,10 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) die("Disconnect event on /dev/net/tun device, exiting"); - if (events & EPOLLIN) - tap_pasta_input(c, now); + if (events & EPOLLIN) { + while (tap_pasta_input(c, now)) + ; + } } /** @@ -1251,7 +1263,7 @@ void tap_listen_handler(struct ctx *c, uint32_t events) trace("tap: failed to set SO_SNDBUF to %i", v); ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLRDHUP; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } @@ -1307,7 +1319,7 @@ static void tap_sock_tun_init(struct ctx *c) pasta_ns_conf(c); ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLRDHUP; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } @@ -1340,7 +1352,7 @@ void tap_sock_init(struct ctx *c) else ref.type = EPOLL_TYPE_TAP_PASTA; - ev.events = EPOLLIN | EPOLLRDHUP; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); return; -- 2.46.0