On Wed, Jul 09, 2025 at 07:47:47PM +0200, Eugenio Pérez wrote:
> From ~13Gbit/s to ~11.5Gbit/s.
Again, I really don't know what you're comparing to what here.
> TODO: Maybe we can reuse epoll for this, not needing to introduce a new syscall.
I really hope so.
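
To make it concrete, what I'd hope for is something along the lines of the sketch below: the call eventfd is already registered with the main epoll loop, so its handler could simply drain the eventfd and reclaim whatever the kernel has already marked as used, without ever blocking. The handler name and its signature below are mine, just for illustration, not from this patch or from existing passt code:

static void vhost_call_handler(const struct ctx *c, int call_fd)
{
	uint64_t ev;

	/* Drain the eventfd so we don't keep getting EPOLLIN for the
	 * same notification.
	 */
	if (read(call_fd, &ev, sizeof(ev)) < 0 && errno != EAGAIN)
		return;

	/* Reclaim everything that's already used: with n == (size_t)-1
	 * this patch's tap_free_old_xmit() never gets a valid pollfd,
	 * so it can't block here.
	 */
	tap_free_old_xmit(c, (size_t)-1);
}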
> Signed-off-by: Eugenio Pérez
> ---
>  tap.c     | 59 +++++++++++++++++++++++++++++++++++++++++++++----------
>  tap.h     |  2 +-
>  tcp_buf.c |  6 +++---
>  3 files changed, 53 insertions(+), 14 deletions(-)
> 
> diff --git a/tap.c b/tap.c
> index 55357e3..93a8c12 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -19,6 +19,7 @@
>  #include
>  #include
>  #include
> +#include
>  #include
>  #include
>  #include
> @@ -120,7 +121,7 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
>  #define VHOST_NDESCS (PKT_BUF_BYTES / 65520)
>  static_assert(!(VHOST_NDESCS & (VHOST_NDESCS - 1)),
>  	      "Number of vhost descs must be a power of two by standard");
> -static struct {
> +static struct vhost_virtqueue {
>  	/* Descriptor index we're using. This is not the same as avail idx in
>  	 * split: this takes into account the chained descs */
>  	uint16_t vring_idx;
> @@ -472,26 +473,63 @@ static void vhost_kick(struct vring_used *used, int kick_fd)
>  {
>  	eventfd_write(kick_fd, 1);
>  }
> +
> +/*
> + * #syscalls:pasta read poll
> + */
> +static uint16_t used_idx(struct vhost_virtqueue *vq,
> +			 struct vring_avail *avail,
> +			 const struct vring_used *used, int pollfd)
> +{
> +	struct pollfd fds = { .fd = pollfd, .events = POLLIN };
> +	int r;
> +
> +	if (vq->shadow_used_idx == vq->last_used_idx)
> +		vq->shadow_used_idx = le16toh(used->idx);
> +
> +	if (vq->shadow_used_idx != vq->last_used_idx || pollfd < 0)
> +		return vq->shadow_used_idx;
> +
> +	avail->flags &= ~htole16(1ULL << VRING_AVAIL_F_NO_INTERRUPT);
> +	/* trusting syscall for smp_wb() */
> +	r = read(pollfd, (uint64_t[]){0}, sizeof(uint64_t));
> +	assert((r < 0 && errno == EAGAIN) || r == 8);
> +
> +	/* Another opportunity before syscalls */
> +	vq->shadow_used_idx = le16toh(used->idx);
> +	if (vq->shadow_used_idx != vq->last_used_idx) {
> +		return vq->shadow_used_idx;
> +	}
> +
> +	r = poll(&fds, 1, -1);
> +	assert(0 < r);
> +	avail->flags |= htole16(1ULL << VRING_AVAIL_F_NO_INTERRUPT);
> +	vq->shadow_used_idx = le16toh(used->idx);
> +	return vq->shadow_used_idx;
I don't understand what this is accomplishing. It seems like it's blocking waiting for a buffer to be freed, which seems like exactly what we don't want to do.
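
If the intent is only to refresh the shadow copy of the used index opportunistically, I'd expect the non-blocking part of this to be enough on its own, roughly like the sketch below (same field names as in the patch, the function name is mine):

static uint16_t used_idx_peek(struct vhost_virtqueue *vq,
			      const struct vring_used *used)
{
	/* Refresh the shadow copy only once we've consumed everything
	 * we already knew about; never wait for the kernel.
	 */
	if (vq->shadow_used_idx == vq->last_used_idx)
		vq->shadow_used_idx = le16toh(used->idx);

	return vq->shadow_used_idx;
}

and if that still matches last_used_idx, go back to the epoll loop and let the call eventfd wake us up, instead of poll()ing here.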
> +}
> +
>  /* n = target */
> -void tap_free_old_xmit(size_t n)
> +size_t tap_free_old_xmit(const struct ctx *c, size_t n)
>  {
>  	size_t r = 0;
> +	int pollfd = (n == (size_t)-1) ? -1 : c->vq[1].call_fd;
> 
>  	while (r < n) {
> -		uint16_t used_idx = vqs[1].last_used_idx;
> -		if (vqs[1].shadow_used_idx == used_idx) {
> -			vqs[1].shadow_used_idx = le16toh(*(volatile uint16_t*)&vring_used_1.used.idx);
> -
> -			if (vqs[1].shadow_used_idx == used_idx)
> -				continue;
> +		uint16_t last_used = vqs[1].last_used_idx;
> +		if (used_idx(&vqs[1], &vring_avail_1.avail, &vring_used_1.used, pollfd) == last_used) {
> +			assert(pollfd == -1);
> +			return r;
>  		}
> 
> +		/* Only get used array entries after they have been exposed by vhost. */
> +		smp_rmb();
>  		/* assert in-order */
> -		assert(vring_used_1.used.ring[used_idx % VHOST_NDESCS].id == vring_avail_1.avail.ring[used_idx % VHOST_NDESCS]);
> -		vqs[1].num_free += vqs[1].ndescs[used_idx % VHOST_NDESCS];
> +		assert(vring_used_1.used.ring[last_used % VHOST_NDESCS].id == vring_avail_1.avail.ring[last_used % VHOST_NDESCS]);
> +		vqs[1].num_free += vqs[1].ndescs[last_used % VHOST_NDESCS];
>  		vqs[1].last_used_idx++;
>  		r++;
>  	}
> +
> +	return r;
>  }
> 
>  /**
> @@ -1687,6 +1725,7 @@ static int tap_ns_tun(void *arg)
>  			if (rc < 0)
>  				die_perror("Failed to add call eventfd to epoll");
>  		}
> +		fprintf(stderr, "[eperezma %s:%d][i=%d][call_fd=%d]\n", __func__, __LINE__, i, file.fd);
>  		c->vq[i].call_fd = file.fd;
> 
>  		file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> diff --git a/tap.h b/tap.h
> index 7ca0fb0..7004116 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -112,7 +112,7 @@ void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src,
>  		    const struct in6_addr *dst, const void *in, size_t l4len);
>  void tap_send_single(const struct ctx *c, const void *data, size_t l2len,
>  		     bool vhost);
> -void tap_free_old_xmit(size_t n);
> +size_t tap_free_old_xmit(const struct ctx *c, size_t n);
>  size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
>  		       size_t bufs_per_frame, size_t nframes, bool vhost);
>  void eth_update_mac(struct ethhdr *eh,
> diff --git a/tcp_buf.c b/tcp_buf.c
> index 0437120..f74d22d 100644
> --- a/tcp_buf.c
> +++ b/tcp_buf.c
> @@ -137,10 +137,10 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
>  	}
>  }
> 
> -static void tcp_buf_free_old_tap_xmit(void)
> +static void tcp_buf_free_old_tap_xmit(const struct ctx *c)
>  {
>  	while (tcp_payload_tap_used) {
> -		tap_free_old_xmit(tcp_payload_tap_used);
> +		tap_free_old_xmit(c, tcp_payload_tap_used);
> 
>  		tcp_payload_tap_used = 0;
>  		tcp_payload_sock_used = 0;
> @@ -162,7 +162,7 @@ void tcp_payload_flush(const struct ctx *c)
>  				       tcp_payload_sock_used - m);
>  	}
>  	tcp_payload_tap_used += m;
> -	tcp_buf_free_old_tap_xmit();
> +	tcp_buf_free_old_tap_xmit(c);
>  }
> 
>  /**
-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson