On Thu, Jul 24, 2025 at 3:34 AM David Gibson wrote:
On Wed, Jul 09, 2025 at 07:47:48PM +0200, Eugenio Pérez wrote:
Now both tcp_sock and tap use the circular buffer as intended.
Very lightly tested. In particular, the paths where the ring is full or almost full, which are checked before producing (e.g. tcp_payload_sock_used + fill_bufs > TCP_FRAMES_MEM), have seen little testing.
Processing the tx buffers in a circular buffer makes namespace rx go from ~11.5 Gbit/s to ~17.26 Gbit/s.
TODO: Increase the tx queue length, as we spend a lot of descriptors on each request. Ideally, the tx queue size should be at least bufs_per_frame*TCP_FRAMES_MEM, but we might get more performance with even bigger queues.
TODO: Sometimes we call tcp_buf_free_old_tap_xmit twice: once to free at least N used tx buffers, and then again in tcp_payload_flush. Maybe we can optimize that.
Signed-off-by: Eugenio Pérez
---
 tcp_buf.c | 130 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 106 insertions(+), 24 deletions(-)

diff --git a/tcp_buf.c b/tcp_buf.c
index f74d22d..326af79 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -53,13 +53,66 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516")
 /* References tracking the owner connection of frames in the tap outqueue */
 static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp_payload_sock_used, tcp_payload_tap_used;
+
+/*
+ * sock_head: Head of buffers available for writing. tcp_data_to_tap moves it
+ * forward, but errors queueing to vhost can move it backwards to tap_head
+ * again.
+ *
+ * tap_head: Head of buffers that have been sent to vhost. flush moves this
+ * forward.
+ *
+ * tail: Chasing index. Increments when vhost uses buffers.
+ *
+ * _used: Independent variables to tell between full and empty.
Hm. I kind of hope there's a less bulky way of doing this.
The other option I know of is to always keep one entry unused.
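Roughly, the two options compare like this; a standalone sketch with made-up names (ring_counted, ring_gap, RING_SIZE), not the passt code:

#include <stdbool.h>
#include <stddef.h>

#define RING_SIZE 128	/* power of two, standing in for TCP_FRAMES_MEM */

/* Option A: head/tail plus explicit element counts, as in this patch.
 * head == tail alone is ambiguous, so the count tells full from empty
 * and all RING_SIZE slots can be used.
 */
struct ring_counted {
	size_t head, tail, used;
};

static bool ring_counted_empty(const struct ring_counted *r)
{
	return r->used == 0;
}

static bool ring_counted_full(const struct ring_counted *r)
{
	return r->used == RING_SIZE;
}

/* Option B: keep one slot unused.  head == tail always means empty, and
 * "advancing head would reach tail" means full, with no extra counters
 * to keep in sync, at the cost of holding at most RING_SIZE - 1 entries.
 */
struct ring_gap {
	size_t head, tail;
};

static bool ring_gap_empty(const struct ring_gap *r)
{
	return r->head == r->tail;
}

static bool ring_gap_full(const struct ring_gap *r)
{
	return ((r->head + 1) % RING_SIZE) == r->tail;
}

The one-slot-unused variant would drop the *_used counters, but it caps the ring at one frame less than TCP_FRAMES_MEM, so the batching thresholds in the patch would need to account for that.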
+ */
+static unsigned int tcp_payload_sock_head, tcp_payload_tap_head, tcp_payload_tail, tcp_payload_sock_used, tcp_payload_tap_used;
+#define IS_POW2(y) (((y) > 0) && !((y) & ((y) - 1)))
Worth putting this in util.h as a separate patch.
Agree.
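Something like this, as a rough sketch of that separate util.h patch (final placement and wording of the comment to be decided):

/* util.h: evaluates to true when y is a non-zero power of two */
#define IS_POW2(y)	(((y) > 0) && !((y) & ((y) - 1)))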
+static_assert(ARRAY_SIZE(tcp_payload) == TCP_FRAMES_MEM, "TCP_FRAMES_MEM is not the size of tcp_payload anymore");
+static_assert(IS_POW2(TCP_FRAMES_MEM), "TCP_FRAMES_MEM must be a power of two");
+
+static size_t tcp_payload_cnt_to_end(size_t head, size_t tail)
+{
+	assert(head != tail);
+	size_t end = ARRAY_SIZE(tcp_payload) - tail;
+	size_t n = (head + end) % ARRAY_SIZE(tcp_payload);
+
+	return MIN(n, end);
+}
+
+/* Count the number of items that has been written from sock to the
+ * curcular buffer and can be sent to tap.
s/curcular/circular/g
Thanks for the catch, fixing in the next release!
+ */
+static size_t tcp_payload_tap_cnt(void)
+{
+	return tcp_payload_sock_used - tcp_payload_tap_used;
+}
+
 static void tcp_payload_sock_produce(size_t n)
 {
+	tcp_payload_sock_head = (tcp_payload_sock_head + n) % ARRAY_SIZE(tcp_payload);
 	tcp_payload_sock_used += n;
 }

+/* Count the number of consecutive items that has been written from sock to the
+ * curcular buffer and can be sent to tap without having to wrap back to the
+ * beginning of the buffer.
+ */
+static size_t tcp_payload_tap_cnt_to_end(void)
+{
+	if (tcp_payload_sock_head == tcp_payload_tap_head) {
+		/* empty? */
+		if (tcp_payload_sock_used - tcp_payload_tap_used == 0)
+			return 0;
+
+		/* full */
+		return ARRAY_SIZE(tcp_payload) - tcp_payload_tap_head;
+	}
+
+	return tcp_payload_cnt_to_end(tcp_payload_sock_head,
+				      tcp_payload_tap_head);
+}
+
 static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];

 /**
@@ -137,14 +190,13 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
 	}
 }

-static void tcp_buf_free_old_tap_xmit(const struct ctx *c)
+static void tcp_buf_free_old_tap_xmit(const struct ctx *c, size_t target)
 {
-	while (tcp_payload_tap_used) {
-		tap_free_old_xmit(c, tcp_payload_tap_used);
+	size_t n = tap_free_old_xmit(c, target);

-		tcp_payload_tap_used = 0;
-		tcp_payload_sock_used = 0;
-	}
+	tcp_payload_tail = (tcp_payload_tail + n) & (ARRAY_SIZE(tcp_payload) - 1);
Use % instead of & here: it's consistent with the other places, and the compiler should be able to optimize it to the same thing.
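For unsigned operands and a power-of-two size, the two forms compute the same value, so the compiler can reduce the modulo to the mask. A standalone check of that, with made-up names (RING_SIZE standing in for ARRAY_SIZE(tcp_payload)), not code from the patch:

#include <assert.h>
#include <stddef.h>

#define RING_SIZE 128	/* power of two, like TCP_FRAMES_MEM */

/* With unsigned operands and a power-of-two size, x % RING_SIZE equals
 * x & (RING_SIZE - 1), and compilers emit the same AND for both.
 */
static size_t wrap_mod(size_t x)
{
	return x % RING_SIZE;
}

static size_t wrap_and(size_t x)
{
	return x & (RING_SIZE - 1);
}

int main(void)
{
	for (size_t x = 0; x < 4 * RING_SIZE; x++)
		assert(wrap_mod(x) == wrap_and(x));
	return 0;
}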
+	tcp_payload_tap_used -= n;
+	tcp_payload_sock_used -= n;
 }

 /**
@@ -153,16 +205,33 @@ static void tcp_buf_free_old_tap_xmit(const struct ctx *c)
  */
 void tcp_payload_flush(const struct ctx *c)
 {
-	size_t m;
+	size_t m, n = tcp_payload_tap_cnt_to_end();
+	struct iovec *head = &tcp_l2_iov[tcp_payload_tap_head][0];

-	m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
-			    tcp_payload_sock_used, true);
-	if (m != tcp_payload_sock_used) {
-		tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
-			       tcp_payload_sock_used - m);
-	}
+	tcp_buf_free_old_tap_xmit(c, (size_t)-1);
+	m = tap_send_frames(c, head, TCP_NUM_IOVS, n, true);
 	tcp_payload_tap_used += m;
-	tcp_buf_free_old_tap_xmit(c);
+	tcp_payload_tap_head = (tcp_payload_tap_head + m) %
+			       ARRAY_SIZE(tcp_payload);
+
+	if (m != n) {
+		n = tcp_payload_tap_cnt_to_end();
+
+		tcp_revert_seq(c, &tcp_frame_conns[tcp_payload_tap_head],
+			       &tcp_l2_iov[tcp_payload_tap_head], n);
+		/*
+		 * circular buffer wrap case.
+		 * TODO: Maybe it's better to adapt tcp_revert_seq.
+		 */
+		tcp_revert_seq(c, &tcp_frame_conns[0], &tcp_l2_iov[0],
+			       tcp_payload_tap_cnt() - n);
+
+		tcp_payload_sock_head = tcp_payload_tap_head;
+		tcp_payload_sock_used = tcp_payload_tap_used;
+	} else if (tcp_payload_tap_cnt_to_end()) {
+		/* circular buffer wrap case */
+		tcp_payload_flush(c);
+	}
 }

 /**
@@ -209,14 +278,15 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	size_t optlen;
 	size_t l4len;
 	uint32_t seq;
+	unsigned int i = tcp_payload_sock_head;
 	int ret;

-	iov = tcp_l2_iov[tcp_payload_sock_used];
+	iov = tcp_l2_iov[i];
 	if (CONN_V4(conn)) {
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 	} else {
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
 		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
 	}
@@ -228,13 +298,15 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 		return ret;

 	tcp_payload_sock_produce(1);
+	i = tcp_payload_sock_head;
 	l4len = optlen + sizeof(struct tcphdr);
 	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
 	tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);

 	if (flags & DUP_ACK) {
-		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_sock_used];
+		struct iovec *dup_iov = tcp_l2_iov[i];
 		tcp_payload_sock_produce(1);
+		i = tcp_payload_sock_head;

 		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
 		       iov[TCP_IOV_TAP].iov_len);
@@ -246,7 +318,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
 	}

 	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 2) {
+		tcp_buf_free_old_tap_xmit(c, 2);
 		tcp_payload_flush(c);
+		/* TODO how to fix this? original code didn't chech for success either */
+		assert(tcp_payload_sock_used <= TCP_FRAMES_MEM - 2);
 	}

 	return 0;
@@ -269,16 +344,17 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	struct iovec *iov;

 	conn->seq_to_tap = seq + dlen;
-	tcp_frame_conns[tcp_payload_sock_used] = conn;
-	iov = tcp_l2_iov[tcp_payload_sock_used];
+	tcp_frame_conns[tcp_payload_sock_head] = conn;
+	iov = tcp_l2_iov[tcp_payload_sock_head];
 	if (CONN_V4(conn)) {
 		if (no_csum) {
-			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_sock_used - 1];
+			unsigned prev = (tcp_payload_sock_head - 1) % TCP_FRAMES_MEM;
+			struct iovec *iov_prev = tcp_l2_iov[prev];
 			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;

 			check = &iph->check;
 		}
-		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_head]);
 		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
 	} else if (CONN_V6(conn)) {
 		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
@@ -294,8 +370,11 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 	tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
 	tcp_payload_sock_produce(1);
 	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 1) {
+		tcp_buf_free_old_tap_xmit(c, 1);
 		tcp_payload_flush(c);
+		assert(tcp_payload_sock_used <= TCP_FRAMES_MEM - 1);
 	}
+
 }

 /**
@@ -362,11 +441,14 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
 	}

 	if (tcp_payload_sock_used + fill_bufs > TCP_FRAMES_MEM) {
+		tcp_buf_free_old_tap_xmit(c, fill_bufs);
 		tcp_payload_flush(c);
+		/* TODO how to report this to upper layers? */
+		assert(tcp_payload_sock_used + fill_bufs <= TCP_FRAMES_MEM);
 	}

 	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
-		iov->iov_base = &tcp_payload[tcp_payload_sock_used + i].data;
+		iov->iov_base = &tcp_payload[(tcp_payload_sock_head + i) % TCP_FRAMES_MEM].data;
 		iov->iov_len = mss;
 	}
 	if (iov_rem)
--
David Gibson (he or they)		| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au		| minimalist, thank you, not the other way
					| around.
http://www.ozlabs.org/~dgibson