The vhost-kernel module is async by nature: the driver (pasta) places a
few buffers in the virtqueue and the device (vhost-kernel) trusts that the
driver will not modify them until it has used them. Implementing this is
not possible with TCP at the moment, as tcp_buf trusts it can reuse the
buffers as soon as tcp_payload_flush() finishes.
To achieve async behavior, let's make tcp_buf work with a circular ring, so
vhost can transmit at the same time pasta is queuing more data. When a
buffer is received from a TCP socket, the element is placed in the ring and
sock_head is moved:
[][][][]
^ ^
| |
| sock_head
|
tail
tap_head
When the data is sent to vhost through the tx queue, tap_head is moved
forward:
[][][][]
^ ^
| |
| sock_head
| tap_head
|
tail
Finally, the tail moves forward when vhost has used the tx buffers, so
tcp_payload (and all lower protocol buffers) can be reused.
[][][][]
^
|
sock_head
tap_head
tail
In the case of an error when queueing to the vhost virtqueue, sock_head
moves backwards. The only possible error is that the queue is full, as
virtio-net does not report success on packet sending.
Start as simply as possible, implementing only the count variables in this
patch so the code keeps working as before. The circular behavior will be
added on top.
Throughput goes from ~16 Gbit/s to ~13 Gbit/s compared with write(2) to the tap.
Signed-off-by: Eugenio Pérez
---
tcp_buf.c | 63 +++++++++++++++++++++++++++++++++++--------------------
1 file changed, 40 insertions(+), 23 deletions(-)
diff --git a/tcp_buf.c b/tcp_buf.c
index 242086d..0437120 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -53,7 +53,12 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516")
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp_payload_used;
+static unsigned int tcp_payload_sock_used, tcp_payload_tap_used;
+
+static void tcp_payload_sock_produce(size_t n)
+{
+ tcp_payload_sock_used += n;
+}
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
@@ -132,6 +137,16 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
}
}
+static void tcp_buf_free_old_tap_xmit(void)
+{
+ while (tcp_payload_tap_used) {
+ tap_free_old_xmit(tcp_payload_tap_used);
+
+ tcp_payload_tap_used = 0;
+ tcp_payload_sock_used = 0;
+ }
+}
+
/**
* tcp_payload_flush() - Send out buffers for segments with data or flags
* @c: Execution context
@@ -141,12 +156,13 @@ void tcp_payload_flush(const struct ctx *c)
size_t m;
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
- tcp_payload_used, false);
- if (m != tcp_payload_used) {
+ tcp_payload_sock_used, true);
+ if (m != tcp_payload_sock_used) {
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
- tcp_payload_used - m);
+ tcp_payload_sock_used - m);
}
- tcp_payload_used = 0;
+ tcp_payload_tap_used += m;
+ tcp_buf_free_old_tap_xmit();
}
/**
@@ -195,12 +211,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
uint32_t seq;
int ret;
- iov = tcp_l2_iov[tcp_payload_used];
+ iov = tcp_l2_iov[tcp_payload_sock_used];
if (CONN_V4(conn)) {
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else {
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
@@ -211,13 +227,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (ret <= 0)
return ret;
- tcp_payload_used++;
+ tcp_payload_sock_produce(1);
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
if (flags & DUP_ACK) {
- struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
+ struct iovec *dup_iov = tcp_l2_iov[tcp_payload_sock_used];
+ tcp_payload_sock_produce(1);
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
@@ -228,8 +245,9 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
}
- if (tcp_payload_used > TCP_FRAMES_MEM - 2)
+ if (tcp_payload_sock_used > TCP_FRAMES_MEM - 2) {
tcp_payload_flush(c);
+ }
return 0;
}
@@ -251,19 +269,19 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
struct iovec *iov;
conn->seq_to_tap = seq + dlen;
- tcp_frame_conns[tcp_payload_used] = conn;
- iov = tcp_l2_iov[tcp_payload_used];
+ tcp_frame_conns[tcp_payload_sock_used] = conn;
+ iov = tcp_l2_iov[tcp_payload_sock_used];
if (CONN_V4(conn)) {
if (no_csum) {
- struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
+ struct iovec *iov_prev = tcp_l2_iov[tcp_payload_sock_used - 1];
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else if (CONN_V6(conn)) {
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
payload = iov[TCP_IOV_PAYLOAD].iov_base;
@@ -274,8 +292,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
- if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
+ tcp_payload_sock_produce(1);
+ if (tcp_payload_sock_used > TCP_FRAMES_MEM - 1) {
tcp_payload_flush(c);
+ }
}
/**
@@ -341,15 +361,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
mh_sock.msg_iovlen = fill_bufs;
}
- if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
+ if (tcp_payload_sock_used + fill_bufs > TCP_FRAMES_MEM) {
tcp_payload_flush(c);
-
- /* Silence Coverity CWE-125 false positive */
- tcp_payload_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
- iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
+ iov->iov_base = &tcp_payload[tcp_payload_sock_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
@@ -407,7 +424,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
dlen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
- int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
+ int no_csum = i && i != send_bufs - 1 && tcp_payload_sock_used;
bool push = false;
if (i == send_bufs - 1) {
--
2.50.0