On Wed, Jul 09, 2025 at 07:47:46PM +0200, Eugenio Pérez wrote:
> The vhost-kernel module is async by nature: the driver (pasta) places
> a few buffers in the virtqueue and the device (vhost-kernel) trust the
s/trust/trusts/
> driver will not modify them until it uses them. Implementing that is
> not possible with TCP at the moment, as tcp_buf trusts it can reuse
> the buffers as soon as tcp_payload_flush() finishes.
> To achieve async let's make tcp_buf work with a circular ring, so
> vhost can transmit at the same time pasta is queuing more data. When
> a buffer is received from a TCP socket, the element is placed in the
> ring and sock_head is moved:
>
>  [][][][]
>  ^      ^
>  |      |
>  |      sock_head
>  |
>  tail
>  tap_head
> When the data is sent to vhost through the tx queue, tap_head is
> moved forward:
>
>  [][][][]
>  ^      ^
>  |      |
>  |      sock_head
>  |      tap_head
>  |
>  tail
> Finally, the tail moves forward when vhost has used the tx buffers,
> so tcp_payload (and all lower protocol buffers) can be reused:
>
>  [][][][]
>         ^
>         |
>         sock_head
>         tap_head
>         tail
This all sounds good. I wonder if it might be clearer to do this circular queue conversion as a separate patch series. I think it makes sense even without the context of vhost (it's closer to how most network things work).
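As an aside, here's a minimal sketch of the three-index scheme described
above, assuming free-running unsigned indices over a power-of-two ring.
The names (ring_produce() and so on) are purely illustrative, not the
counters the patch actually introduces:

#include <assert.h>

#define RING_SIZE	128	/* assumption: any power of two */
#define RING_MASK	(RING_SIZE - 1)

/* Free-running indices; the slot for index i is buf[i & RING_MASK] */
static unsigned tail;		/* still owned by vhost, not yet reusable  */
static unsigned tap_head;	/* queued to the tx virtqueue, in flight   */
static unsigned sock_head;	/* next free slot for data read from sockets */

/* Producer: data read from a TCP socket fills buffers at sock_head */
static int ring_produce(unsigned n)
{
	if (sock_head - tail + n > RING_SIZE)
		return -1;	/* ring full */
	sock_head += n;
	return 0;
}

/* Transmit: everything between tap_head and sock_head goes to vhost */
static void ring_send(void)
{
	tap_head = sock_head;
}

/* Completion: vhost used n buffers, so the oldest slots become reusable */
static void ring_free_old(unsigned n)
{
	tail += n;
	assert((int)(tap_head - tail) >= 0);	/* tail never passes tap_head */
}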
> In the case of error queueing to the vhost virtqueue, sock_head moves
> backwards. The only possible error is that the queue is full, as
sock_head moves backwards? Or tap_head moves backwards?
> virtio-net does not report success on packet sending.
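Taking the commit message at its word, I'd picture the failure path
roughly as below; vq_add() is a made-up stand-in for the real virtqueue
call, and per the question above the rolled-back index may really be
tap_head rather than sock_head:

extern int vq_add(void);	/* hypothetical stand-in for the enqueue */

static int ring_enqueue_one(void)
{
	sock_head++;		/* tentatively claim the slot */
	if (vq_add() < 0) {	/* the only failure: queue is full */
		sock_head--;	/* head moves backwards, retried later */
		return -1;
	}
	return 0;
}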
> Starting as simple as possible, and only implementing the count
> variables in this patch so it keeps working as before. The circular
> behavior will be added on top.
> From ~16Gbit/s to ~13Gbit/s compared with write(2) to the tap.
I don't really understand what you're comparing here.
> Signed-off-by: Eugenio Pérez
> ---
>  tcp_buf.c | 63 +++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 40 insertions(+), 23 deletions(-)
>
> diff --git a/tcp_buf.c b/tcp_buf.c
> index 242086d..0437120 100644
> --- a/tcp_buf.c
> +++ b/tcp_buf.c
> @@ -53,7 +53,12 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516")
>
>  /* References tracking the owner connection of frames in the tap outqueue */
>  static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
> -static unsigned int tcp_payload_used;
> +static unsigned int tcp_payload_sock_used, tcp_payload_tap_used;
I think the "payload" here is a hangover from when we had separate queues for flags-only and data-containing packets. We can probably drop it and make a bunch of names shorter.
> +
> +static void tcp_payload_sock_produce(size_t n)
> +{
> +	tcp_payload_sock_used += n;
> +}
>
>  static struct iovec	tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> @@ -132,6 +137,16 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
>  	}
>  }
>
> +static void tcp_buf_free_old_tap_xmit(void)
> +{
> +	while (tcp_payload_tap_used) {
> +		tap_free_old_xmit(tcp_payload_tap_used);
> +
> +		tcp_payload_tap_used = 0;
> +		tcp_payload_sock_used = 0;
> +	}
> +}
> +
>  /**
>   * tcp_payload_flush() - Send out buffers for segments with data or flags
>   * @c:		Execution context
> @@ -141,12 +156,13 @@ void tcp_payload_flush(const struct ctx *c)
>  	size_t m;
>
>  	m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
> -			    tcp_payload_used, false);
> -	if (m != tcp_payload_used) {
> +			    tcp_payload_sock_used, true);
> +	if (m != tcp_payload_sock_used) {
>  		tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
> -			       tcp_payload_used - m);
> +			       tcp_payload_sock_used - m);
>  	}
> -	tcp_payload_used = 0;
> +	tcp_payload_tap_used += m;
> +	tcp_buf_free_old_tap_xmit();
>  }
>
>  /**
> @@ -195,12 +211,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	uint32_t seq;
>  	int ret;
>
> -	iov = tcp_l2_iov[tcp_payload_used];
> +	iov = tcp_l2_iov[tcp_payload_sock_used];
>  	if (CONN_V4(conn)) {
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
>  	} else {
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
>  	}
>
> @@ -211,13 +227,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (ret <= 0)
>  		return ret;
>
> -	tcp_payload_used++;
> +	tcp_payload_sock_produce(1);
>  	l4len = optlen + sizeof(struct tcphdr);
>  	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
>  	tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
>
>  	if (flags & DUP_ACK) {
> -		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
> +		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_sock_used];
> +		tcp_payload_sock_produce(1);
>
>  		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
>  		       iov[TCP_IOV_TAP].iov_len);
> @@ -228,8 +245,9 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
>  	}
>
> -	if (tcp_payload_used > TCP_FRAMES_MEM - 2)
> +	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 2) {
>  		tcp_payload_flush(c);
> +	}
No { } here in passt style.
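That is, the single-statement body stays unbraced:

	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 2)
		tcp_payload_flush(c);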
>
>  	return 0;
>  }
> @@ -251,19 +269,19 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  	struct iovec *iov;
>
>  	conn->seq_to_tap = seq + dlen;
> -	tcp_frame_conns[tcp_payload_used] = conn;
> -	iov = tcp_l2_iov[tcp_payload_used];
> +	tcp_frame_conns[tcp_payload_sock_used] = conn;
> +	iov = tcp_l2_iov[tcp_payload_sock_used];
>  	if (CONN_V4(conn)) {
>  		if (no_csum) {
> -			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
> +			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_sock_used - 1];
>  			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
>
>  			check = &iph->check;
>  		}
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
>  	} else if (CONN_V6(conn)) {
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
>  	}
>  	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> @@ -274,8 +292,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  	payload->th.psh = push;
>  	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
>  	tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
> -	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
> +	tcp_payload_sock_produce(1);
> +	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 1) {
>  		tcp_payload_flush(c);
> +	}
>  }
>
>  /**
> @@ -341,15 +361,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
>  		mh_sock.msg_iovlen = fill_bufs;
>  	}
>
> -	if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
> +	if (tcp_payload_sock_used + fill_bufs > TCP_FRAMES_MEM) {
>  		tcp_payload_flush(c);
> -
> -		/* Silence Coverity CWE-125 false positive */
> -		tcp_payload_used = 0;
>  	}
>
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> -		iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
> +		iov->iov_base = &tcp_payload[tcp_payload_sock_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)
> @@ -407,7 +424,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
>  	dlen = mss;
>  	seq = conn->seq_to_tap;
>  	for (i = 0; i < send_bufs; i++) {
> -		int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
> +		int no_csum = i && i != send_bufs - 1 && tcp_payload_sock_used;
>  		bool push = false;
>
>  		if (i == send_bufs - 1) {
--
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson