On Thu, Jul 24, 2025 at 3:21 AM David Gibson
On Wed, Jul 09, 2025 at 07:47:47PM +0200, Eugenio Pérez wrote:
From ~13Gbit/s to ~11.5Gbit/s.
Again, I really don't know what you're comparing to what here.
When the buffer is full I'm using poll() to wait until vhost frees some buffers, instead of actively checking the used index. This is the cost of the syscall. We could mitigate some of this by making the tx queue larger, though.
TODO: Maybe we can reuse epoll for this, avoiding the need to introduce a new syscall.
I really hope so.
Signed-off-by: Eugenio Pérez
--- tap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---------- tap.h | 2 +- tcp_buf.c | 6 +++--- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/tap.c b/tap.c index 55357e3..93a8c12 100644 --- a/tap.c +++ b/tap.c @@ -19,6 +19,7 @@ #include
#include #include +#include #include #include #include @@ -120,7 +121,7 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf); #define VHOST_NDESCS (PKT_BUF_BYTES / 65520) static_assert(!(VHOST_NDESCS & (VHOST_NDESCS - 1)), "Number of vhost descs must be a power of two by standard"); -static struct { +static struct vhost_virtqueue { /* Descriptor index we're using. This is not the same as avail idx in * split: this takes into account the chained descs */ uint16_t vring_idx; @@ -472,26 +473,63 @@ static void vhost_kick(struct vring_used *used, int kick_fd) { eventfd_write(kick_fd, 1); } +/* + * #syscalls:pasta read poll + */ +static uint16_t used_idx(struct vhost_virtqueue *vq, + struct vring_avail *avail, + const struct vring_used *used, int pollfd) +{ + struct pollfd fds = { .fd = pollfd, .events = POLLIN }; + int r; + + if (vq->shadow_used_idx == vq->last_used_idx) + vq->shadow_used_idx = le16toh(used->idx); + + if (vq->shadow_used_idx != vq->last_used_idx || pollfd < 0) + return vq->shadow_used_idx; + + avail->flags &= ~htole16(1ULL << VRING_AVAIL_F_NO_INTERRUPT); + /* trusting syscall for smp_wb() */ + r = read(pollfd, (uint64_t[]){0}, sizeof(uint64_t)); + assert((r < 0 && errno == EAGAIN) || r == 8); + + /* Another opportunity before syscalls */ + vq->shadow_used_idx = le16toh(used->idx); + if (vq->shadow_used_idx != vq->last_used_idx) { + return vq->shadow_used_idx; + } + + r = poll(&fds, 1, -1); + assert (0 < r); + avail->flags |= htole16(1ULL << VRING_AVAIL_F_NO_INTERRUPT); + vq->shadow_used_idx = le16toh(used->idx); + return vq->shadow_used_idx;
I don't understand what this is accomplishing. It seems like it's blocking waiting for a buffer to be freed, which seems like exactly what we don't want to do.
+} + /* n = target */ -void tap_free_old_xmit(size_t n) +size_t tap_free_old_xmit(const struct ctx *c, size_t n) { size_t r = 0; + int pollfd = (n == (size_t)-1) ? -1 : c->vq[1].call_fd;
while (r < n) { - uint16_t used_idx = vqs[1].last_used_idx; - if (vqs[1].shadow_used_idx == used_idx) { - vqs[1].shadow_used_idx = le16toh(*(volatile uint16_t*)&vring_used_1.used.idx); - - if (vqs[1].shadow_used_idx == used_idx) - continue; + uint16_t last_used = vqs[1].last_used_idx; + if (used_idx(&vqs[1], &vring_avail_1.avail, &vring_used_1.used, pollfd) == last_used) { + assert(pollfd == -1); + return r; }
+ /* Only get used array entries after they have been exposed by vhost. */ + smp_rmb(); /* assert in-order */ - assert(vring_used_1.used.ring[used_idx % VHOST_NDESCS].id == vring_avail_1.avail.ring[used_idx % VHOST_NDESCS]); - vqs[1].num_free += vqs[1].ndescs[used_idx % VHOST_NDESCS]; + assert(vring_used_1.used.ring[last_used % VHOST_NDESCS].id == vring_avail_1.avail.ring[last_used % VHOST_NDESCS]); + vqs[1].num_free += vqs[1].ndescs[last_used % VHOST_NDESCS]; vqs[1].last_used_idx++; r++; } + + return r; }
/** @@ -1687,6 +1725,7 @@ static int tap_ns_tun(void *arg) if (rc < 0) die_perror("Failed to add call eventfd to epoll"); } + fprintf(stderr, "[eperezma %s:%d][i=%d][call_fd=%d]\n", __func__, __LINE__, i, file.fd); c->vq[i].call_fd = file.fd;
file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); diff --git a/tap.h b/tap.h index 7ca0fb0..7004116 100644 --- a/tap.h +++ b/tap.h @@ -112,7 +112,7 @@ void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, const void *in, size_t l4len); void tap_send_single(const struct ctx *c, const void *data, size_t l2len, bool vhost); -void tap_free_old_xmit(size_t n); +size_t tap_free_old_xmit(const struct ctx *c, size_t n); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t bufs_per_frame, size_t nframes, bool vhost); void eth_update_mac(struct ethhdr *eh, diff --git a/tcp_buf.c b/tcp_buf.c index 0437120..f74d22d 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -137,10 +137,10 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, } }
-static void tcp_buf_free_old_tap_xmit(void) +static void tcp_buf_free_old_tap_xmit(const struct ctx *c) { while (tcp_payload_tap_used) { - tap_free_old_xmit(tcp_payload_tap_used); + tap_free_old_xmit(c, tcp_payload_tap_used);
tcp_payload_tap_used = 0; tcp_payload_sock_used = 0; @@ -162,7 +162,7 @@ void tcp_payload_flush(const struct ctx *c) tcp_payload_sock_used - m); } tcp_payload_tap_used += m; - tcp_buf_free_old_tap_xmit(); + tcp_buf_free_old_tap_xmit(c); }
/**
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson