Add vhost-user support to the TCP path.

tcp_send_flag() and tcp_data_from_sock() now dispatch between the
existing tap buffer implementations and new vhost-user ones, which
build the Ethernet/IP/TCP frames directly in the guest's virtqueue
buffers. As the guest accepts VIRTIO_NET_HDR_F_DATA_VALID, in
vhost-user mode the TCP checksum is only computed when packet capture
is enabled, and that now happens in tcp_vu.c rather than in tcp.c.

Signed-off-by: Laurent Vivier
---
Makefile | 6 +-
tcp.c | 66 +++++---
tcp_vu.c | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
tcp_vu.h | 10 ++
4 files changed, 502 insertions(+), 27 deletions(-)
create mode 100644 tcp_vu.c
create mode 100644 tcp_vu.h
diff --git a/Makefile b/Makefile
index 2016b071ddf2..f7a403d19b61 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \
igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \
passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \
- tcp_buf.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
+ tcp_buf.c tcp_vu.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@@ -56,8 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \
flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \
netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \
- tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \
- util.h iov.h ip.h virtio.h vhost_user.h
+ tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_vu.h tcp_internal.h \
+ udp.h util.h iov.h ip.h virtio.h vhost_user.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/tcp.c b/tcp.c
index b6aca9f37f19..e829e12fe7c2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -302,6 +302,7 @@
#include "flow_table.h"
#include "tcp_internal.h"
#include "tcp_buf.h"
+#include "tcp_vu.h"
/* Sides of a flow as we use them in "tap" connections */
#define SOCKSIDE 0
@@ -1034,7 +1035,7 @@ size_t ipv4_fill_headers(const struct ctx *c,
tcp_set_tcp_header(th, conn, seq);
th->check = 0;
- if (c->mode != MODE_VU || *c->pcap)
+ if (c->mode != MODE_VU)
th->check = tcp_update_check_tcp4(iph);
return ip_len;
@@ -1072,7 +1073,7 @@ size_t ipv6_fill_headers(const struct ctx *c,
tcp_set_tcp_header(th, conn, seq);
th->check = 0;
- if (c->mode != MODE_VU || *c->pcap)
+ if (c->mode != MODE_VU)
th->check = tcp_update_check_tcp6(ip6h);
ip6h->hop_limit = 255;
@@ -1302,6 +1303,12 @@ int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
return 1;
}
+int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+ if (c->mode == MODE_VU)
+ return tcp_vu_send_flag(c, conn, flags);
+ return tcp_buf_send_flag(c, conn, flags);
+}
/**
* tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
@@ -1313,7 +1320,7 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED)
return;
- if (!tcp_buf_send_flag(c, conn, RST))
+ if (!tcp_send_flag(c, conn, RST))
conn_event(c, conn, CLOSED);
}
@@ -1430,7 +1437,8 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
*
* Return: clamped MSS value
*/
-static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
+static uint16_t tcp_conn_tap_mss(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
const char *opts, size_t optlen)
{
unsigned int mss;
@@ -1441,7 +1449,10 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
else
mss = ret;
- mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
+ if (c->mode == MODE_VU)
+ mss = MIN(tcp_vu_conn_tap_mss(conn), mss);
+ else
+ mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
return MIN(mss, USHRT_MAX);
}
@@ -1568,7 +1579,7 @@ static void tcp_conn_from_tap(struct ctx *c,
conn->wnd_to_tap = WINDOW_DEFAULT;
- mss = tcp_conn_tap_mss(conn, opts, optlen);
+ mss = tcp_conn_tap_mss(c, conn, opts, optlen);
if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
MSS_SET(conn, mss);
@@ -1625,7 +1636,7 @@ static void tcp_conn_from_tap(struct ctx *c,
} else {
tcp_get_sndbuf(conn);
- if (tcp_buf_send_flag(c, conn, SYN | ACK))
+ if (tcp_send_flag(c, conn, SYN | ACK))
return;
conn_event(c, conn, TAP_SYN_ACK_SENT);
@@ -1673,6 +1684,13 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
return 0;
}
+static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ if (c->mode == MODE_VU)
+ return tcp_vu_data_from_sock(c, conn);
+
+ return tcp_buf_data_from_sock(c, conn);
+}
/**
* tcp_data_from_tap() - tap/guest data for established connection
@@ -1806,7 +1824,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
max_ack_seq, conn->seq_to_tap);
conn->seq_ack_from_tap = max_ack_seq;
conn->seq_to_tap = max_ack_seq;
- tcp_buf_data_from_sock(c, conn);
+ tcp_data_from_sock(c, conn);
}
if (!iov_i)
@@ -1822,14 +1840,14 @@ eintr:
* Then swiftly looked away and left.
*/
conn->seq_from_tap = seq_from_tap;
- tcp_buf_send_flag(c, conn, ACK);
+ tcp_send_flag(c, conn, ACK);
}
if (errno == EINTR)
goto eintr;
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED);
return p->count - idx;
}
@@ -1839,7 +1857,7 @@ eintr:
if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
partial_send = 1;
conn->seq_from_tap += n;
- tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED);
} else {
conn->seq_from_tap += n;
}
@@ -1852,7 +1870,7 @@ out:
*/
if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
- tcp_buf_send_flag(c, conn, DUP_ACK);
+ tcp_send_flag(c, conn, DUP_ACK);
}
return p->count - idx;
}
@@ -1866,7 +1884,7 @@ out:
conn_event(c, conn, TAP_FIN_RCVD);
} else {
- tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED);
}
return p->count - idx;
@@ -1891,7 +1909,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
conn->wnd_from_tap = 1;
- MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
+ MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
conn->seq_init_from_tap = ntohl(th->seq) + 1;
conn->seq_from_tap = conn->seq_init_from_tap;
@@ -1902,8 +1920,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
/* The client might have sent data already, which we didn't
* dequeue waiting for SYN,ACK from tap -- check now.
*/
- tcp_buf_data_from_sock(c, conn);
- tcp_buf_send_flag(c, conn, ACK);
+ tcp_data_from_sock(c, conn);
+ tcp_send_flag(c, conn, ACK);
}
/**
@@ -1983,7 +2001,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
conn->seq_from_tap++;
shutdown(conn->sock, SHUT_WR);
- tcp_buf_send_flag(c, conn, ACK);
+ tcp_send_flag(c, conn, ACK);
conn_event(c, conn, SOCK_FIN_SENT);
return 1;
@@ -1994,7 +2012,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
tcp_tap_window_update(conn, ntohs(th->window));
- tcp_buf_data_from_sock(c, conn);
+ tcp_data_from_sock(c, conn);
if (p->count - idx == 1)
return 1;
@@ -2024,7 +2042,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
shutdown(conn->sock, SHUT_WR);
conn_event(c, conn, SOCK_FIN_SENT);
- tcp_buf_send_flag(c, conn, ACK);
+ tcp_send_flag(c, conn, ACK);
ack_due = 0;
}
@@ -2058,7 +2076,7 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
return;
}
- if (tcp_buf_send_flag(c, conn, SYN | ACK))
+ if (tcp_send_flag(c, conn, SYN | ACK))
return;
conn_event(c, conn, TAP_SYN_ACK_SENT);
@@ -2126,7 +2144,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c,
conn->wnd_from_tap = WINDOW_DEFAULT;
- tcp_buf_send_flag(c, conn, SYN);
+ tcp_send_flag(c, conn, SYN);
conn_flag(c, conn, ACK_FROM_TAP_DUE);
tcp_get_sndbuf(conn);
@@ -2190,7 +2208,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
return;
if (conn->flags & ACK_TO_TAP_DUE) {
- tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+ tcp_send_flag(c, conn, ACK_IF_NEEDED);
tcp_timer_ctl(c, conn);
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED)) {
@@ -2206,7 +2224,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
flow_dbg(conn, "ACK timeout, retry");
conn->retrans++;
conn->seq_to_tap = conn->seq_ack_from_tap;
- tcp_buf_data_from_sock(c, conn);
+ tcp_data_from_sock(c, conn);
tcp_timer_ctl(c, conn);
}
} else {
@@ -2261,7 +2279,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
conn_event(c, conn, SOCK_FIN_RCVD);
if (events & EPOLLIN)
- tcp_buf_data_from_sock(c, conn);
+ tcp_data_from_sock(c, conn);
if (events & EPOLLOUT)
tcp_update_seqack_wnd(c, conn, 0, NULL);
diff --git a/tcp_vu.c b/tcp_vu.c
new file mode 100644
index 000000000000..ed59b21cabdc
--- /dev/null
+++ b/tcp_vu.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <netinet/ip.h>
+
+#include <sys/socket.h>
+
+#include <linux/tcp.h>
+#include <linux/virtio_net.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "siphash.h"
+#include "inany.h"
+#include "vhost_user.h"
+#include "tcp.h"
+#include "pcap.h"
+#include "flow.h"
+#include "tcp_conn.h"
+#include "flow_table.h"
+#include "tcp_vu.h"
+#include "tcp_internal.h"
+#include "checksum.h"
+
+#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn) (!CONN_V4(conn))
+
+/* vhost-user */
+static const struct virtio_net_hdr vu_header = {
+ .flags = VIRTIO_NET_HDR_F_DATA_VALID,
+ .gso_type = VIRTIO_NET_HDR_GSO_NONE,
+};
+
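+/* Scratch space for the element popped from the virtqueue, and per-buffer
+ * iovec and descriptor index tables used to fill the receive queue
+ */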
+static unsigned char buffer[65536];
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
+static unsigned int indexes[VIRTQUEUE_MAX_SIZE];
+
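+/**
+ * tcp_vu_conn_tap_mss() - MSS to advertise to the guest on the tap side
+ * @conn: Connection pointer (unused: no tap buffer limit applies here)
+ *
+ * Return: USHRT_MAX, as frames are built directly in guest buffers
+ */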
+uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
+{
+ (void)conn;
+ return USHRT_MAX;
+}
+
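+/**
+ * tcp_vu_send_flag() - Send segment with flags to the guest over vhost-user
+ * @c: Execution context
+ * @conn: Connection pointer
+ * @flags: TCP flags: if not set, send segment only if ACK is due
+ *
+ * Return: negative error code on connection reset, 0 otherwise
+ */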
+int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+ VuDev *vdev = (VuDev *)&c->vdev;
+ VuVirtqElement *elem;
+ VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ struct virtio_net_hdr_mrg_rxbuf *vh;
+ size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
+ struct ethhdr *eh;
+ int ret;
+ int nb_ack;
+
+ elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
+ if (!elem)
+ return 0;
+
+ if (elem->in_num < 1) {
+ err("virtio-net receive queue contains no in buffers");
+ vu_queue_rewind(vdev, vq, 1);
+ return 0;
+ }
+
+ /* Options: MSS, NOP and window scale (8 bytes) */
+ if (flags & SYN)
+ optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
+
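+ /* The frame starts with the virtio-net header; with mergeable
+ * buffers it also carries the number of descriptors used
+ */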
+ vh = elem->in_sg[0].iov_base;
+
+ vh->hdr = vu_header;
+ if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
+ vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ vh->num_buffers = htole16(1);
+ } else {
+ vnet_hdrlen = sizeof(struct virtio_net_hdr);
+ }
+ eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);
+
+ memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
+ memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
+
+ if (CONN_V4(conn)) {
+ struct iphdr *iph = (struct iphdr *)(eh + 1);
+ struct tcphdr *th = (struct tcphdr *)(iph + 1);
+ char *data = (char *)(th + 1);
+
+ eh->h_proto = htons(ETH_P_IP);
+
+ *th = (struct tcphdr){
+ .doff = sizeof(struct tcphdr) / 4,
+ .ack = 1
+ };
+
+ *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+
+ ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
+ if (ret <= 0) {
+ vu_queue_rewind(vdev, vq, 1);
+ return ret;
+ }
+
+ ip_len = ipv4_fill_headers(c, conn, iph, optlen, NULL,
+ conn->seq_to_tap);
+
+ tlen = ip_len + sizeof(struct ethhdr);
+
+ if (*c->pcap) {
+ uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
+
+ th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
+ }
+ } else {
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
+ struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
+ char *data = (char *)(th + 1);
+
+ eh->h_proto = htons(ETH_P_IPV6);
+
+ *th = (struct tcphdr){
+ .doff = sizeof(struct tcphdr) / 4,
+ .ack = 1
+ };
+
+ *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+
+ ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
+ if (ret <= 0) {
+ vu_queue_rewind(vdev, vq, 1);
+ return ret;
+ }
+
+ ip_len = ipv6_fill_headers(c, conn, ip6h, optlen,
+ conn->seq_to_tap);
+
+ tlen = ip_len + sizeof(struct ethhdr);
+
+ if (*c->pcap) {
+ uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
+
+ th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
+ }
+ }
+
+ pcap((void *)eh, tlen);
+
+ tlen += vnet_hdrlen;
+ vu_queue_fill(vdev, vq, elem, tlen, 0);
+ nb_ack = 1;
+
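+ /* A DUP_ACK is the same frame sent twice: copy it to a second
+ * descriptor, if one is available
+ */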
+ if (flags & DUP_ACK) {
+ elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
+ if (elem) {
+ if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
+ vu_queue_rewind(vdev, vq, 1);
+ } else {
+ memcpy(elem->in_sg[0].iov_base, vh, tlen);
+ nb_ack++;
+ }
+ }
+ }
+
+ vu_queue_flush(vdev, vq, nb_ack);
+ vu_queue_notify(vdev, vq);
+
+ return 0;
+}
+
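+/**
+ * tcp_vu_data_from_sock() - Handle new data from socket, queue to
+ * vhost-user, in window
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Return: negative on connection reset, 0 otherwise
+ */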
+int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+ uint32_t already_sent;
+ VuDev *vdev = (VuDev *)&c->vdev;
+ VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+ int s = conn->sock, v4 = CONN_V4(conn);
+ int i, ret = 0, iov_count, iov_used;
+ struct msghdr mh_sock = { 0 };
+ size_t l2_hdrlen, vnet_hdrlen, fillsize;
+ ssize_t len;
+ uint16_t *check;
+ uint16_t mss = MSS_GET(conn);
+ int num_buffers;
+ int segment_size;
+ struct iovec *first;
+ bool has_mrg_rxbuf;
+
+ if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
+ err("Got packet, but no available descriptors on RX virtq.");
+ return 0;
+ }
+
+ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
+
+ if (SEQ_LT(already_sent, 0)) {
+ /* RFC 761, section 2.1. */
+ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ already_sent = 0;
+ }
+
+ if (!wnd_scaled || already_sent >= wnd_scaled) {
+ conn_flag(c, conn, STALLED);
+ conn_flag(c, conn, ACK_FROM_TAP_DUE);
+ return 0;
+ }
+
+ /* Set up buffer descriptors we'll fill completely and partially. */
+
+ fillsize = wnd_scaled;
+
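+ /* The first iovec points the already-sent part of the stream at a
+ * discard buffer: recvmsg() below peeks from the start of the queue
+ */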
+ iov_vu[0].iov_base = tcp_buf_discard;
+ iov_vu[0].iov_len = already_sent;
+ fillsize -= already_sent;
+
+ has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
+ if (has_mrg_rxbuf) {
+ vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else {
+ vnet_hdrlen = sizeof(struct virtio_net_hdr);
+ }
+ l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);
+ if (v4) {
+ l2_hdrlen += sizeof(struct iphdr);
+ } else {
+ l2_hdrlen += sizeof(struct ipv6hdr);
+ }
+
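+ /* Pop guest buffers and build the iovec array: with mergeable
+ * buffers, a segment can span several descriptors and is capped at
+ * the connection MSS; without them, each buffer is its own segment
+ */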
+ iov_count = 0;
+ segment_size = 0;
+ while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
+ VuVirtqElement *elem;
+
+ elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
+ if (!elem)
+ break;
+
+ if (elem->in_num < 1) {
+ err("virtio-net receive queue contains no in buffers");
+ goto err;
+ }
+
+ ASSERT(elem->in_num == 1);
+ ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
+
+ indexes[iov_count] = elem->index;
+
+ if (segment_size == 0) {
+ iov_vu[iov_count + 1].iov_base =
+ (char *)elem->in_sg[0].iov_base + l2_hdrlen;
+ iov_vu[iov_count + 1].iov_len =
+ elem->in_sg[0].iov_len - l2_hdrlen;
+ } else {
+ iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
+ iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
+ }
+
+ if (iov_vu[iov_count + 1].iov_len > fillsize)
+ iov_vu[iov_count + 1].iov_len = fillsize;
+
+ segment_size += iov_vu[iov_count + 1].iov_len;
+ if (!has_mrg_rxbuf) {
+ segment_size = 0;
+ } else if (segment_size >= mss) {
+ iov_vu[iov_count + 1].iov_len -= segment_size - mss;
+ segment_size = 0;
+ }
+ fillsize -= iov_vu[iov_count + 1].iov_len;
+
+ iov_count++;
+ }
+ if (iov_count == 0)
+ return 0;
+
+ mh_sock.msg_iov = iov_vu;
+ mh_sock.msg_iovlen = iov_count + 1;
+
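+ /* Peek, don't dequeue: data stays queued in the socket until the
+ * guest acknowledges it, so it can be peeked again on retransmission
+ */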
+ do
+ len = recvmsg(s, &mh_sock, MSG_PEEK);
+ while (len < 0 && errno == EINTR);
+
+ if (len < 0)
+ goto err;
+
+ if (!len) {
+ vu_queue_rewind(vdev, vq, iov_count);
+ if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+ if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
+ tcp_rst(c, conn);
+ return ret;
+ }
+
+ conn_event(c, conn, TAP_FIN_SENT);
+ }
+
+ return 0;
+ }
+
+ len -= already_sent;
+ if (len <= 0) {
+ conn_flag(c, conn, STALLED);
+ vu_queue_rewind(vdev, vq, iov_count);
+ return 0;
+ }
+
+ conn_flag(c, conn, ~STALLED);
+
+ /* Likely, some new data was acked too. */
+ tcp_update_seqack_wnd(c, conn, 0, NULL);
+
+ /* initialize headers */
+ iov_used = 0;
+ num_buffers = 0;
+ check = NULL;
+ segment_size = 0;
+ for (i = 0; i < iov_count && len; i++) {
+
+ if (segment_size == 0)
+ first = &iov_vu[i + 1];
+
+ if (iov_vu[i + 1].iov_len > (size_t)len)
+ iov_vu[i + 1].iov_len = len;
+
+ len -= iov_vu[i + 1].iov_len;
+ iov_used++;
+
+ segment_size += iov_vu[i + 1].iov_len;
+ num_buffers++;
+
+ if (segment_size >= mss || len == 0 ||
+ i + 1 == iov_count || !has_mrg_rxbuf) {
+
+ struct ethhdr *eh;
+ struct virtio_net_hdr_mrg_rxbuf *vh;
+ char *base = (char *)first->iov_base - l2_hdrlen;
+ size_t size = first->iov_len + l2_hdrlen;
+
+ vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
+
+ vh->hdr = vu_header;
+ if (has_mrg_rxbuf)
+ vh->num_buffers = htole16(num_buffers);
+
+ eh = (struct ethhdr *)((char *)base + vnet_hdrlen);
+
+ memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
+ memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
+
+ /* initialize header */
+ if (v4) {
+ struct iphdr *iph = (struct iphdr *)(eh + 1);
+ struct tcphdr *th = (struct tcphdr *)(iph + 1);
+
+ eh->h_proto = htons(ETH_P_IP);
+
+ *th = (struct tcphdr){
+ .doff = sizeof(struct tcphdr) / 4,
+ .ack = 1
+ };
+
+ *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+
+ ipv4_fill_headers(c, conn, iph, segment_size,
+ len ? check : NULL, conn->seq_to_tap);
+
+ if (*c->pcap) {
+ uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
+
+ first->iov_base = th;
+ first->iov_len = size - l2_hdrlen + sizeof(*th);
+
+ th->check = csum_iov(first, num_buffers, sum);
+ }
+
+ check = &iph->check;
+ } else {
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
+ struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
+
+ eh->h_proto = htons(ETH_P_IPV6);
+
+ *th = (struct tcphdr){
+ .doff = sizeof(struct tcphdr) / 4,
+ .ack = 1
+ };
+
+ *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+
+ ipv6_fill_headers(c, conn, ip6h, segment_size,
+ conn->seq_to_tap);
+ if (*c->pcap) {
+ uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
+
+ first->iov_base = th;
+ first->iov_len = size - l2_hdrlen + sizeof(*th);
+
+ th->check = csum_iov(first, num_buffers, sum);
+ }
+ }
+
+ /* set iov for pcap logging */
+ first->iov_base = eh;
+ first->iov_len = size - vnet_hdrlen;
+
+ pcap_iov(first, num_buffers);
+
+ /* set iov length for vu_queue_fill_by_index() */
+ first->iov_base = base;
+ first->iov_len = size;
+
+ conn->seq_to_tap += segment_size;
+
+ segment_size = 0;
+ num_buffers = 0;
+ }
+ }
+
+ /* release unused buffers */
+ vu_queue_rewind(vdev, vq, iov_count - iov_used);
+
+ /* send packets */
+ for (i = 0; i < iov_used; i++) {
+ vu_queue_fill_by_index(vdev, vq, indexes[i],
+ iov_vu[i + 1].iov_len, i);
+ }
+
+ vu_queue_flush(vdev, vq, iov_used);
+ vu_queue_notify(vdev, vq);
+
+ conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+ return 0;
+err:
+ vu_queue_rewind(vdev, vq, iov_count);
+
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
+ ret = -errno;
+ tcp_rst(c, conn);
+ }
+
+ return ret;
+}
diff --git a/tcp_vu.h b/tcp_vu.h
new file mode 100644
index 000000000000..8045a6e3edb8
--- /dev/null
+++ b/tcp_vu.h
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#ifndef TCP_VU_H
+#define TCP_VU_H
+
+uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn);
+int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
+
+#endif /* TCP_VU_H */
--
2.42.0