Signed-off-by: Laurent Vivier
---
Makefile | 7 +-
tcp.c | 618 ++-----------------------------------------------
tcp_buf.c | 569 +++++++++++++++++++++++++++++++++++++++++++++
tcp_buf.h | 17 ++
tcp_internal.h | 78 +++++++
5 files changed, 689 insertions(+), 600 deletions(-)
create mode 100644 tcp_buf.c
create mode 100644 tcp_buf.h
create mode 100644 tcp_internal.h
diff --git a/Makefile b/Makefile
index acf37f5a2036..bf370b6ec2e6 100644
--- a/Makefile
+++ b/Makefile
@@ -46,8 +46,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \
igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \
- passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c udp.c \
- util.c iov.c ip.c
+ passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \
+ tcp_buf.c udp.c util.c iov.c ip.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \
flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \
netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \
- tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h iov.h ip.h
+ tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \
+ util.h iov.h ip.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/tcp.c b/tcp.c
index 640209533772..54c15087d678 100644
--- a/tcp.c
+++ b/tcp.c
@@ -300,57 +300,19 @@
#include "flow.h"
#include "flow_table.h"
+#include "tcp_internal.h"
+#include "tcp_buf.h"
/* Sides of a flow as we use them in "tap" connections */
#define SOCKSIDE 0
#define TAPSIDE 1
-#define TCP_FRAMES_MEM 128
-#define TCP_FRAMES \
- (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
-
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
-#define MAX_WS 8
-#define MAX_WINDOW (1 << (16 + (MAX_WS)))
-
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
-struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
-#ifdef __AVX2__
- uint8_t pad[26];
-#else
- uint8_t pad[2];
-#endif
- struct tap_hdr taph;
- struct iphdr iph;
- struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
-#ifdef __AVX2__
- uint8_t pad[14];
-#else
- uint8_t pad[2];
-#endif
- struct tap_hdr taph;
- struct ipv6hdr ip6h;
- struct tcphdr th;
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
-#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
-
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd)
@@ -372,31 +334,9 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
*/
#define SOL_TCP IPPROTO_TCP
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
-
-#define FIN (1 << 0)
-#define SYN (1 << 1)
-#define RST (1 << 2)
-#define ACK (1 << 4)
-/* Flags for internal usage */
-#define DUP_ACK (1 << 5)
#define ACK_IF_NEEDED 0 /* See tcp_buf_send_flag() */
-#define OPT_EOL 0
-#define OPT_NOP 1
-#define OPT_MSS 2
-#define OPT_MSS_LEN 4
-#define OPT_WS 3
-#define OPT_WS_LEN 3
-#define OPT_SACKP 4
-#define OPT_SACK 5
-#define OPT_TS 8
-
-#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
-#define CONN_V6(conn) (!CONN_V4(conn))
+
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -433,144 +373,11 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq: Pointer to sequence number sent to tap-side, to be updated
- * @len: TCP payload length
- */
-struct tcp_buf_seq_update {
- uint32_t *seq;
- uint16_t len;
-};
-
-/* Static buffers */
-
-/**
- * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
- * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph: Tap-level headers (partially pre-filled)
- * @iph: Pre-filled IP header (except for tot_len and saddr)
- * @uh: Headroom for TCP header
- * @data: Storage for TCP payload
- */
-static struct tcp4_l2_buf_t {
-#ifdef __AVX2__
- uint8_t pad[26]; /* 0, align th to 32 bytes */
-#else
- uint8_t pad[2]; /* align iph to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 26 2 */
- struct iphdr iph; /* 44 20 */
- struct tcphdr th; /* 64 40 */
- uint8_t data[MSS4]; /* 84 60 */
- /* 65536 65532 */
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp4_l2_buf[TCP_FRAMES_MEM];
-
-static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_buf_used;
-
-/**
- * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
- * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph: Tap-level headers (partially pre-filled)
- * @ip6h: Pre-filled IP header (except for payload_len and addresses)
- * @th: Headroom for TCP header
- * @data: Storage for TCP payload
- */
-struct tcp6_l2_buf_t {
-#ifdef __AVX2__
- uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
-#else
- uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 14 2 */
- struct ipv6hdr ip6h; /* 32 20 */
- struct tcphdr th; /* 72 60 */
- uint8_t data[MSS6]; /* 92 80 */
- /* 65536 65532 */
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp6_l2_buf[TCP_FRAMES_MEM];
-
-static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_l2_buf_used;
-
-/* recvmsg()/sendmsg() data for tap */
-static char tcp_buf_discard [MAX_WINDOW];
-static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
-
-static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
-static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
-static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
-static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
+char tcp_buf_discard [MAX_WINDOW];
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
-/**
- * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
- * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
- * @taph: Tap-level headers (partially pre-filled)
- * @iph: Pre-filled IP header (except for tot_len and saddr)
- * @th: Headroom for TCP header
- * @opts: Headroom for TCP options
- */
-static struct tcp4_l2_flags_buf_t {
-#ifdef __AVX2__
- uint8_t pad[26]; /* 0, align th to 32 bytes */
-#else
- uint8_t pad[2]; /* align iph to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 26 2 */
- struct iphdr iph; /* 44 20 */
- struct tcphdr th; /* 64 40 */
- char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp4_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_l2_flags_buf_used;
-
-/**
- * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
- * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
- * @taph: Tap-level headers (partially pre-filled)
- * @ip6h: Pre-filled IP header (except for payload_len and addresses)
- * @th: Headroom for TCP header
- * @opts: Headroom for TCP options
- */
-static struct tcp6_l2_flags_buf_t {
-#ifdef __AVX2__
- uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
-#else
- uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
-#endif
- struct tap_hdr taph; /* 14 2 */
- struct ipv6hdr ip6h; /* 32 20 */
- struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
- char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)))
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
-#endif
-tcp6_l2_flags_buf[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_l2_flags_buf_used;
-
#define CONN(idx) (&(FLOW(idx)->tcp))
/* Table for lookup from remote address, local port, remote port */
@@ -611,14 +418,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLRDHUP;
}
-static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
- unsigned long flag);
-#define conn_flag(c, conn, flag) \
- do { \
- flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
- conn_flag_do(c, conn, flag); \
- } while (0)
-
/**
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
* @c: Execution context
@@ -730,8 +529,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
- unsigned long flag)
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long flag)
{
if (flag & (flag - 1)) {
int flag_index = fls(~flag);
@@ -781,8 +580,8 @@ static void tcp_hash_remove(const struct ctx *c,
* @conn: Connection pointer
* @event: Connection event
*/
-static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
- unsigned long event)
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long event)
{
int prev, new, num = fls(event);
@@ -830,12 +629,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
}
-#define conn_event(c, conn, event) \
- do { \
- flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
- conn_event_do(c, conn, event); \
- } while (0)
-
/**
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
* @conn: Connection pointer
@@ -959,91 +752,6 @@ static uint16_t tcp_update_check_tcp6(struct ipv6hdr *ip6h)
return csum(th, ntohs(ip6h->payload_len), sum);
}
-/**
- * tcp_buf_update_l2() - Update L2 buffers with Ethernet and IPv4 addresses
- * @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
- */
-void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s)
-{
- int i;
-
- for (i = 0; i < TCP_FRAMES_MEM; i++) {
- struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
- struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
- struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
- struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
-
- tap_update_mac(&b4->taph, eth_d, eth_s);
- tap_update_mac(&b6->taph, eth_d, eth_s);
- tap_update_mac(&b4f->taph, eth_d, eth_s);
- tap_update_mac(&b6f->taph, eth_d, eth_s);
- }
-}
-
-/**
- * tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
- * @c: Execution context
- */
-static void tcp_buf_sock4_iov_init(const struct ctx *c)
-{
- struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
- struct iovec *iov;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
- tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IP),
- .iph = iph,
- .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
- };
- }
-
- for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
- tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IP),
- .iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
- };
- }
-
- for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
-
- for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
-}
-
-/**
- * tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
- * @c: Execution context
- */
-static void tcp_buf_sock6_iov_init(const struct ctx *c)
-{
- struct iovec *iov;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
- tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IPV6),
- .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
- .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
- };
- }
-
- for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
- tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
- .taph = TAP_HDR_INIT(ETH_P_IPV6),
- .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
- };
- }
-
- for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
-
- for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
- iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
-}
-
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
* @opts: Pointer to start of TCP options in header
@@ -1269,46 +977,6 @@ bool tcp_flow_defer(union flow *flow)
return true;
}
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
-#define tcp_rst(c, conn) \
- do { \
- flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
- tcp_rst_do(c, conn); \
- } while (0)
-
-/**
- * tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags)
- * @c: Execution context
- */
-static void tcp_buf_l2_flags_flush(const struct ctx *c)
-{
- tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
- tcp6_l2_flags_buf_used = 0;
-
- tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
- tcp4_l2_flags_buf_used = 0;
-}
-
-/**
- * tcp_buf_l2_data_flush() - Send out buffers for segments with data
- * @c: Execution context
- */
-static void tcp_buf_l2_data_flush(const struct ctx *c)
-{
- unsigned i;
- size_t m;
-
- m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
- for (i = 0; i < m; i++)
- *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
- tcp6_l2_buf_used = 0;
-
- m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
- for (i = 0; i < m; i++)
- *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
- tcp4_l2_buf_used = 0;
-}
-
/**
* tcp_defer_handler() - Handler for TCP deferred tasks
* @c: Execution context
@@ -1348,10 +1016,10 @@ static void tcp_set_tcp_header(struct tcphdr *th,
* Return: IP frame length including L2 headers, host order
*/
-static size_t ipv4_fill_headers(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct iphdr *iph, size_t plen,
- const uint16_t *check, uint32_t seq)
+size_t ipv4_fill_headers(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct iphdr *iph, size_t plen,
+ const uint16_t *check, uint32_t seq)
{
struct tcphdr *th = (void *)(iph + 1);
const struct in_addr *a4 = inany_v4(&conn->faddr);
@@ -1382,10 +1050,10 @@ static size_t ipv4_fill_headers(const struct ctx *c,
* Return: IP frame length including L2 headers, host order
*/
-static size_t ipv6_fill_headers(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct ipv6hdr *ip6h, size_t plen,
- uint32_t seq)
+size_t ipv6_fill_headers(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct ipv6hdr *ip6h, size_t plen,
+ uint32_t seq)
{
struct tcphdr *th = (void *)(ip6h + 1);
size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
@@ -1423,8 +1091,8 @@ static size_t ipv6_fill_headers(const struct ctx *c,
*
* Return: 1 if sequence or window were updated, 0 otherwise
*/
-static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
- int force_seq, struct tcp_info *tinfo)
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+ int force_seq, struct tcp_info *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
@@ -1539,7 +1207,8 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* Return: negative error code on connection reset, 0 otherwise
*/
-static int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t optlen)
+int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+ struct tcphdr *th, char *data, size_t optlen)
{
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
@@ -1629,77 +1298,13 @@ static int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
return 1;
}
-static int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
-{
- size_t optlen = 0;
- struct iovec *iov;
- size_t ip_len;
- int ret;
-
- /* Options: MSS, NOP and window scale (8 bytes) */
- if (flags & SYN)
- optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
-
- if (CONN_V4(conn)) {
- struct tcp4_l2_flags_buf_t *b4;
-
- iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used;
- b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
-
- ret = do_tcp_send_flag(c, conn, flags, &b4->th, b4->opts,
- optlen);
- if (ret <= 0)
- return ret;
-
- ip_len = ipv4_fill_headers(c, conn, &b4->iph, optlen,
- NULL, conn->seq_to_tap);
-
- iov->iov_len = tap_iov_len(c, &b4->taph, ip_len);
-
- if (flags & DUP_ACK) {
-
- memcpy(b4 + 1, b4, sizeof(*b4));
- (iov + 1)->iov_len = iov->iov_len;
- tcp4_l2_flags_buf_used++;
- }
-
- if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
- tcp_buf_l2_flags_flush(c);
- } else {
- struct tcp6_l2_flags_buf_t *b6;
-
- iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used;
- b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
-
- ret = do_tcp_send_flag(c, conn, flags, &b6->th, b6->opts,
- optlen);
- if (ret <= 0)
- return ret;
-
- ip_len = ipv6_fill_headers(c, conn, &b6->ip6h, optlen,
- conn->seq_to_tap);
-
- iov->iov_len = tap_iov_len(c, &b6->taph, ip_len);
-
- if (flags & DUP_ACK) {
- memcpy(b6 + 1, b6, sizeof(*b6));
- (iov + 1)->iov_len = iov->iov_len;
- tcp6_l2_flags_buf_used++;
- }
-
- if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
- tcp_buf_l2_flags_flush(c);
- }
-
- return 0;
-}
/**
* tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1813,14 +1418,6 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
return s;
}
-static uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn)
-{
- if (CONN_V4(conn))
- return MSS4;
-
- return MSS6;
-}
-
/**
* tcp_conn_tap_mss() - Get MSS value advertised by tap/guest
* @conn: Connection pointer
@@ -2072,179 +1669,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
return 0;
}
-/**
- * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
- * @c: Execution context
- * @conn: Connection pointer
- * @plen: Payload length at L4
- * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
- * @seq: Sequence number to be sent
- */
-static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
- ssize_t plen, int no_csum, uint32_t seq)
-{
- uint32_t *seq_update = &conn->seq_to_tap;
- struct iovec *iov;
- size_t ip_len;
-
- if (CONN_V4(conn)) {
- struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
- const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
-
- tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
- tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
-
- ip_len = ipv4_fill_headers(c, conn, &b->iph, plen,
- check, seq);
-
- iov = tcp4_l2_iov + tcp4_l2_buf_used++;
- iov->iov_len = tap_iov_len(c, &b->taph, ip_len);
- if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
- tcp_buf_l2_data_flush(c);
- } else if (CONN_V6(conn)) {
- struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
-
- tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
- tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
-
- ip_len = ipv6_fill_headers(c, conn, &b->ip6h, plen, seq);
-
- iov = tcp6_l2_iov + tcp6_l2_buf_used++;
- iov->iov_len = tap_iov_len(c, &b->taph, ip_len);
- if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
- tcp_buf_l2_data_flush(c);
- }
-}
-
-/**
- * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
- * @c: Execution context
- * @conn: Connection pointer
- *
- * Return: negative on connection reset, 0 otherwise
- *
- * #syscalls recvmsg
- */
-static int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
-{
- uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
- int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
- int sendlen, len, plen, v4 = CONN_V4(conn);
- int s = conn->sock, i, ret = 0;
- struct msghdr mh_sock = { 0 };
- uint16_t mss = MSS_GET(conn);
- uint32_t already_sent, seq;
- struct iovec *iov;
-
- already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
- if (SEQ_LT(already_sent, 0)) {
- /* RFC 761, section 2.1. */
- flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
- conn->seq_ack_from_tap, conn->seq_to_tap);
- conn->seq_to_tap = conn->seq_ack_from_tap;
- already_sent = 0;
- }
-
- if (!wnd_scaled || already_sent >= wnd_scaled) {
- conn_flag(c, conn, STALLED);
- conn_flag(c, conn, ACK_FROM_TAP_DUE);
- return 0;
- }
-
- /* Set up buffer descriptors we'll fill completely and partially. */
- fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
- if (fill_bufs > TCP_FRAMES) {
- fill_bufs = TCP_FRAMES;
- iov_rem = 0;
- } else {
- iov_rem = (wnd_scaled - already_sent) % mss;
- }
-
- mh_sock.msg_iov = iov_sock;
- mh_sock.msg_iovlen = fill_bufs + 1;
-
- iov_sock[0].iov_base = tcp_buf_discard;
- iov_sock[0].iov_len = already_sent;
-
- if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
- (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
- tcp_buf_l2_data_flush(c);
-
- /* Silence Coverity CWE-125 false positive */
- tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
- }
-
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
- if (v4)
- iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
- else
- iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
- iov->iov_len = mss;
- }
- if (iov_rem)
- iov_sock[fill_bufs].iov_len = iov_rem;
-
- /* Receive into buffers, don't dequeue until acknowledged by guest. */
- do
- len = recvmsg(s, &mh_sock, MSG_PEEK);
- while (len < 0 && errno == EINTR);
-
- if (len < 0)
- goto err;
-
- if (!len) {
- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
- if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
- tcp_rst(c, conn);
- return ret;
- }
-
- conn_event(c, conn, TAP_FIN_SENT);
- }
-
- return 0;
- }
-
- sendlen = len - already_sent;
- if (sendlen <= 0) {
- conn_flag(c, conn, STALLED);
- return 0;
- }
-
- conn_flag(c, conn, ~STALLED);
-
- send_bufs = DIV_ROUND_UP(sendlen, mss);
- last_len = sendlen - (send_bufs - 1) * mss;
-
- /* Likely, some new data was acked too. */
- tcp_update_seqack_wnd(c, conn, 0, NULL);
-
- /* Finally, queue to tap */
- plen = mss;
- seq = conn->seq_to_tap;
- for (i = 0; i < send_bufs; i++) {
- int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
-
- if (i == send_bufs - 1)
- plen = last_len;
-
- tcp_data_to_tap(c, conn, plen, no_csum, seq);
- seq += plen;
- }
-
- conn_flag(c, conn, ACK_FROM_TAP_DUE);
-
- return 0;
-
-err:
- if (errno != EAGAIN && errno != EWOULDBLOCK) {
- ret = -errno;
- tcp_rst(c, conn);
- }
-
- return ret;
-}
/**
* tcp_data_from_tap() - tap/guest data for established connection
diff --git a/tcp_buf.c b/tcp_buf.c
new file mode 100644
index 000000000000..d70e7f810e4a
--- /dev/null
+++ b/tcp_buf.c
@@ -0,0 +1,569 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * tcp_buf.c - TCP L2 buffer management
+ *
+ * Copyright (c) 2020-2022 Red Hat GmbH
+ * Author: Stefano Brivio
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "tap.h"
+#include "siphash.h"
+#include "inany.h"
+#include "tcp_conn.h"
+#include "tcp_internal.h"
+#include "tcp_buf.h"
+
+#define TCP_FRAMES_MEM 128
+#define TCP_FRAMES \
+ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+
+struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
+#ifdef __AVX2__
+ uint8_t pad[26];
+#else
+ uint8_t pad[2];
+#endif
+ struct tap_hdr taph;
+ struct iphdr iph;
+ struct tcphdr th;
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
+#ifdef __AVX2__
+ uint8_t pad[14];
+#else
+ uint8_t pad[2];
+#endif
+ struct tap_hdr taph;
+ struct ipv6hdr ip6h;
+ struct tcphdr th;
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
+#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
+
+/**
+ * tcp_buf_seq_update - Sequences to update with length of frames once sent
+ * @seq: Pointer to sequence number sent to tap-side, to be updated
+ * @len: TCP payload length
+ */
+struct tcp_buf_seq_update {
+ uint32_t *seq;
+ uint16_t len;
+};
+
+/* Static buffers */
+
+/**
+ * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
+ * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
+ * @taph: Tap-level headers (partially pre-filled)
+ * @iph: Pre-filled IP header (except for tot_len and saddr)
+ * @th: Headroom for TCP header
+ * @data: Storage for TCP payload
+ */
+static struct tcp4_l2_buf_t {
+#ifdef __AVX2__
+ uint8_t pad[26]; /* 0, align th to 32 bytes */
+#else
+ uint8_t pad[2]; /* align iph to 4 bytes 0 */
+#endif
+ struct tap_hdr taph; /* 26 2 */
+ struct iphdr iph; /* 44 20 */
+ struct tcphdr th; /* 64 40 */
+ uint8_t data[MSS4]; /* 84 60 */
+ /* 65536 65532 */
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)))
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+#endif
+tcp4_l2_buf[TCP_FRAMES_MEM];
+
+static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
+
+static unsigned int tcp4_l2_buf_used;
+
+/**
+ * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
+ * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
+ * @taph: Tap-level headers (partially pre-filled)
+ * @ip6h: Pre-filled IP header (except for payload_len and addresses)
+ * @th: Headroom for TCP header
+ * @data: Storage for TCP payload
+ */
+static struct tcp6_l2_buf_t {
+#ifdef __AVX2__
+ uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
+#else
+ uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
+#endif
+ struct tap_hdr taph; /* 14 2 */
+ struct ipv6hdr ip6h; /* 32 20 */
+ struct tcphdr th; /* 72 60 */
+ uint8_t data[MSS6]; /* 92 80 */
+ /* 65536 65532 */
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)))
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+#endif
+tcp6_l2_buf[TCP_FRAMES_MEM];
+
+static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_l2_buf_used;
+
+/* recvmsg()/sendmsg() data for tap */
+static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
+
+static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM];
+static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM];
+static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM];
+static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM];
+
+/**
+ * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
+ * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only
+ * @taph: Tap-level headers (partially pre-filled)
+ * @iph: Pre-filled IP header (except for tot_len and saddr)
+ * @th: Headroom for TCP header
+ * @opts: Headroom for TCP options
+ */
+static struct tcp4_l2_flags_buf_t {
+#ifdef __AVX2__
+ uint8_t pad[26]; /* 0, align th to 32 bytes */
+#else
+ uint8_t pad[2]; /* align iph to 4 bytes 0 */
+#endif
+ struct tap_hdr taph; /* 26 2 */
+ struct iphdr iph; /* 44 20 */
+ struct tcphdr th; /* 64 40 */
+ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)))
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+#endif
+tcp4_l2_flags_buf[TCP_FRAMES_MEM];
+
+static unsigned int tcp4_l2_flags_buf_used;
+
+/**
+ * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
+ * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
+ * @taph: Tap-level headers (partially pre-filled)
+ * @ip6h: Pre-filled IP header (except for payload_len and addresses)
+ * @th: Headroom for TCP header
+ * @opts: Headroom for TCP options
+ */
+static struct tcp6_l2_flags_buf_t {
+#ifdef __AVX2__
+ uint8_t pad[14]; /* 0 align ip6h to 32 bytes */
+#else
+ uint8_t pad[2]; /* align ip6h to 4 bytes 0 */
+#endif
+ struct tap_hdr taph; /* 14 2 */
+ struct ipv6hdr ip6h; /* 32 20 */
+ struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */
+ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)))
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
+#endif
+tcp6_l2_flags_buf[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_l2_flags_buf_used;
+
+/**
+ * tcp_buf_update_l2() - Update Ethernet addresses in all pre-cooked L2 buffers
+ * @eth_d: Ethernet destination address, NULL if unchanged
+ * @eth_s: Ethernet source address, NULL if unchanged
+ */
+void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s)
+{
+ int i;
+
+ for (i = 0; i < TCP_FRAMES_MEM; i++) {
+ struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
+ struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
+ struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
+ struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
+
+ tap_update_mac(&b4->taph, eth_d, eth_s);
+ tap_update_mac(&b6->taph, eth_d, eth_s);
+ tap_update_mac(&b4f->taph, eth_d, eth_s);
+ tap_update_mac(&b6f->taph, eth_d, eth_s);
+ }
+}
+
+/**
+ * tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
+ * @c: Execution context
+ */
+void tcp_buf_sock4_iov_init(const struct ctx *c)
+{
+ struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
+ struct iovec *iov;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
+ tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
+ .taph = TAP_HDR_INIT(ETH_P_IP),
+ .iph = iph,
+ .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
+ };
+ }
+
+ for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
+ tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
+ .taph = TAP_HDR_INIT(ETH_P_IP),
+ .iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
+ };
+ }
+
+ for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
+ iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
+
+ for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
+ iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
+}
+
+/**
+ * tcp_buf_sock6_iov_init() - Preset IPv6 L2 frame buffers and iovec bases
+ * @c:		Execution context
+ */
+void tcp_buf_sock6_iov_init(const struct ctx *c)
+{
+	int n;
+
+	for (n = 0; n < ARRAY_SIZE(tcp6_l2_buf); n++) {
+		tcp6_l2_buf[n] = (struct tcp6_l2_buf_t) {
+			.taph = TAP_HDR_INIT(ETH_P_IPV6),
+			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
+			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
+		};
+	}
+
+	for (n = 0; n < ARRAY_SIZE(tcp6_l2_flags_buf); n++) {
+		tcp6_l2_flags_buf[n] = (struct tcp6_l2_flags_buf_t) {
+			.taph = TAP_HDR_INIT(ETH_P_IPV6),
+			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
+		};
+	}
+
+	for (n = 0; n < TCP_FRAMES_MEM; n++)
+		tcp6_l2_iov[n].iov_base = tap_iov_base(c, &tcp6_l2_buf[n].taph);
+
+	for (n = 0; n < TCP_FRAMES_MEM; n++)
+		tcp6_l2_flags_iov[n].iov_base =
+			tap_iov_base(c, &tcp6_l2_flags_buf[n].taph);
+}
+
+/**
+ * tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags)
+ * @c:		Execution context
+ */
+void tcp_buf_l2_flags_flush(const struct ctx *c)
+{
+	tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
+	tcp6_l2_flags_buf_used = 0;	/* reset whatever the send result */
+
+	tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
+	tcp4_l2_flags_buf_used = 0;
+}
+
+/**
+ * tcp_buf_l2_data_flush() - Send queued data frames, apply sequence updates
+ * @c:		Execution context
+ */
+void tcp_buf_l2_data_flush(const struct ctx *c)
+{
+	size_t sent, n;
+
+	/* Only the first @sent entries, as counted by tap_send_frames(), advance */
+	sent = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
+	for (n = 0; n < sent; n++)
+		*tcp6_l2_buf_seq_update[n].seq += tcp6_l2_buf_seq_update[n].len;
+	tcp6_l2_buf_used = 0;
+
+	sent = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
+	for (n = 0; n < sent; n++)
+		*tcp4_l2_buf_seq_update[n].seq += tcp4_l2_buf_seq_update[n].len;
+	tcp4_l2_buf_used = 0;
+}
+
+/**
+ * tcp_buf_send_flag() - Queue a segment with flags and no data for the tap
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @flags:	Flags for the segment; DUP_ACK queues a second copy of the frame
+ *
+ * Return: 0 if queued or do_tcp_send_flag() gave 0, else its negative value
+ */
+int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	size_t optlen = 0, ip_len;
+	struct iovec *iov;
+	int ret;
+
+	/* Options: MSS, NOP and window scale (8 bytes) */
+	if (flags & SYN)
+		optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
+
+	if (CONN_V4(conn)) {
+		struct tcp4_l2_flags_buf_t *b4;
+
+		iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used;
+		b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used;
+		ret = do_tcp_send_flag(c, conn, flags, &b4->th, b4->opts,
+				       optlen);
+		if (ret <= 0)	/* Nothing to queue: leave the slot unclaimed */
+			return ret;
+
+		ip_len = ipv4_fill_headers(c, conn, &b4->iph, optlen,
+					   NULL, conn->seq_to_tap);
+		iov->iov_len = tap_iov_len(c, &b4->taph, ip_len);
+		tcp4_l2_flags_buf_used++;
+		if (flags & DUP_ACK) {
+			memcpy(b4 + 1, b4, sizeof(*b4));
+			(iov + 1)->iov_len = iov->iov_len;
+			tcp4_l2_flags_buf_used++;
+		}
+		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
+			tcp_buf_l2_flags_flush(c);
+	} else {
+		struct tcp6_l2_flags_buf_t *b6;
+
+		iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used;
+		b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used;
+		ret = do_tcp_send_flag(c, conn, flags, &b6->th, b6->opts,
+				       optlen);
+		if (ret <= 0)	/* Nothing to queue: leave the slot unclaimed */
+			return ret;
+
+		ip_len = ipv6_fill_headers(c, conn, &b6->ip6h, optlen,
+					   conn->seq_to_tap);
+		iov->iov_len = tap_iov_len(c, &b6->taph, ip_len);
+		tcp6_l2_flags_buf_used++;
+		if (flags & DUP_ACK) {
+			memcpy(b6 + 1, b6, sizeof(*b6));
+			(iov + 1)->iov_len = iov->iov_len;
+			tcp6_l2_flags_buf_used++;
+		}
+		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
+			tcp_buf_l2_flags_flush(c);
+	}
+
+	return 0;
+}
+
+/* tcp_buf_conn_tap_mss() - Select the MSS constant by IP version of @conn
+ * Return: MSS4 for IPv4 connections, MSS6 otherwise
+ */
+uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn)
+{
+	return CONN_V4(conn) ? MSS4 : MSS6;
+}
+
+/**
+ * tcp_data_to_tap() - Finalise the next free data buffer and queue it for tap
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @plen:	Payload length at L4
+ * @no_csum:	Reuse IPv4 checksum from the previous buffer, don't compute it
+ * @seq:	Sequence number to be sent
+ */
+static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+			    ssize_t plen, int no_csum, uint32_t seq)
+{
+	uint32_t *seq_ptr = &conn->seq_to_tap;
+	struct iovec *vec;
+	size_t l3_len;
+
+	if (CONN_V4(conn)) {
+		struct tcp4_l2_buf_t *frame = tcp4_l2_buf + tcp4_l2_buf_used;
+		const uint16_t *prev_check = NULL;
+
+		if (no_csum)	/* caller: only if tcp4_l2_buf_used != 0 */
+			prev_check = &(frame - 1)->iph.check;
+		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_ptr;
+		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
+		l3_len = ipv4_fill_headers(c, conn, &frame->iph, plen,
+					   prev_check, seq);
+
+		vec = tcp4_l2_iov + tcp4_l2_buf_used++;
+		vec->iov_len = tap_iov_len(c, &frame->taph, l3_len);
+		if (tcp4_l2_buf_used >= ARRAY_SIZE(tcp4_l2_buf))
+			tcp_buf_l2_data_flush(c);
+	} else if (CONN_V6(conn)) {
+		struct tcp6_l2_buf_t *frame = tcp6_l2_buf + tcp6_l2_buf_used;
+
+		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_ptr;
+		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
+		l3_len = ipv6_fill_headers(c, conn, &frame->ip6h, plen, seq);
+
+		vec = tcp6_l2_iov + tcp6_l2_buf_used++;
+		vec->iov_len = tap_iov_len(c, &frame->taph, l3_len);
+		if (tcp6_l2_buf_used >= ARRAY_SIZE(tcp6_l2_buf))
+			tcp_buf_l2_data_flush(c);
+	}
+}
+
+/**
+ * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ *
+ * Return: negative error code if the connection was reset, 0 otherwise
+ *
+ * #syscalls recvmsg
+ */
+int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+	int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
+	int sendlen, len, plen, v4 = CONN_V4(conn);
+	int s = conn->sock, i, ret = 0;
+	struct msghdr mh_sock = { 0 };
+	uint16_t mss = MSS_GET(conn);
+	uint32_t already_sent, seq;
+	struct iovec *iov;
+
+	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
+
+	if (SEQ_LT(already_sent, 0)) {
+		/* RFC 761, section 2.1. */
+		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
+			   conn->seq_ack_from_tap, conn->seq_to_tap);
+		conn->seq_to_tap = conn->seq_ack_from_tap;
+		already_sent = 0;
+	}
+
+	if (!wnd_scaled || already_sent >= wnd_scaled) {	/* no window left */
+		conn_flag(c, conn, STALLED);
+		conn_flag(c, conn, ACK_FROM_TAP_DUE);
+		return 0;
+	}
+
+	/* Set up buffer descriptors we'll fill completely and partially. */
+	fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
+	if (fill_bufs > TCP_FRAMES) {
+		fill_bufs = TCP_FRAMES;
+		iov_rem = 0;
+	} else {
+		iov_rem = (wnd_scaled - already_sent) % mss;
+	}
+
+	mh_sock.msg_iov = iov_sock;
+	mh_sock.msg_iovlen = fill_bufs + 1;
+	/* First iovec absorbs already sent bytes that MSG_PEEK reads again */
+	iov_sock[0].iov_base = tcp_buf_discard;
+	iov_sock[0].iov_len = already_sent;
+
+	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
+	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
+		tcp_buf_l2_data_flush(c);
+
+		/* Silence Coverity CWE-125 false positive */
+		tcp4_l2_buf_used = tcp6_l2_buf_used = 0;
+	}
+
+	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+		if (v4)
+			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
+		else
+			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
+		iov->iov_len = mss;
+	}
+	if (iov_rem)	/* last buffer takes only the window remainder */
+		iov_sock[fill_bufs].iov_len = iov_rem;
+
+	/* Receive into buffers, don't dequeue until acknowledged by guest. */
+	do
+		len = recvmsg(s, &mh_sock, MSG_PEEK);
+	while (len < 0 && errno == EINTR);
+
+	if (len < 0)
+		goto err;
+
+	if (!len) {	/* 0 from recvmsg(): peer shut down, no data left */
+		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+			if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
+				tcp_rst(c, conn);
+				return ret;
+			}
+
+			conn_event(c, conn, TAP_FIN_SENT);
+		}
+
+		return 0;
+	}
+
+	sendlen = len - already_sent;
+	if (sendlen <= 0) {	/* peeked no more than was already sent */
+		conn_flag(c, conn, STALLED);
+		return 0;
+	}
+
+	conn_flag(c, conn, ~STALLED);
+
+	send_bufs = DIV_ROUND_UP(sendlen, mss);
+	last_len = sendlen - (send_bufs - 1) * mss;
+
+	/* Likely, some new data was acked too. */
+	tcp_update_seqack_wnd(c, conn, 0, NULL);
+
+	/* Finally, queue to tap */
+	plen = mss;
+	seq = conn->seq_to_tap;
+	for (i = 0; i < send_bufs; i++) {
+		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
+
+		if (i == send_bufs - 1)
+			plen = last_len;
+
+		tcp_data_to_tap(c, conn, plen, no_csum, seq);
+		seq += plen;
+	}
+
+	conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+	return 0;
+
+err:
+	if (errno != EAGAIN && errno != EWOULDBLOCK) {
+		ret = -errno;
+		tcp_rst(c, conn);
+	}
+
+	return ret;
+}
diff --git a/tcp_buf.h b/tcp_buf.h
new file mode 100644
index 000000000000..d23031252002
--- /dev/null
+++ b/tcp_buf.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio
+ */
+
+#ifndef TCP_BUF_H
+#define TCP_BUF_H
+/* Buffer-based TCP tap path: functions defined in tcp_buf.c */
+void tcp_buf_sock4_iov_init(const struct ctx *c);
+void tcp_buf_sock6_iov_init(const struct ctx *c);
+void tcp_buf_l2_flags_flush(const struct ctx *c);
+void tcp_buf_l2_data_flush(const struct ctx *c);
+uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn);
+int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+
+#endif /* TCP_BUF_H */
diff --git a/tcp_internal.h b/tcp_internal.h
new file mode 100644
index 000000000000..36eb2463dd5a
--- /dev/null
+++ b/tcp_internal.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio
+ */
+
+#ifndef TCP_INTERNAL_H
+#define TCP_INTERNAL_H
+
+#define MAX_WS				8
+#define MAX_WINDOW			(1 << (16 + (MAX_WS)))
+/* Sequence ordering by distance below MAX_WINDOW; assumes unsigned operands */
+#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
+#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
+#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)
+
+#define FIN		(1 << 0)	/* FIN..ACK: TCP header flag bit values */
+#define SYN		(1 << 1)
+#define RST		(1 << 2)
+#define ACK		(1 << 4)
+
+/* Flags for internal usage */
+#define DUP_ACK		(1 << 5)	/* tcp_buf_send_flag(): queue frame twice */
+#define OPT_EOL		0	/* OPT_*: TCP option kinds, OPT_*_LEN: lengths */
+#define OPT_NOP		1
+#define OPT_MSS		2
+#define OPT_MSS_LEN	4
+#define OPT_WS		3
+#define OPT_WS_LEN	3
+#define OPT_SACKP	4
+#define OPT_SACK	5
+#define OPT_TS		8
+
+#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn)		(!CONN_V4(conn))
+
+extern char tcp_buf_discard [MAX_WINDOW];	/* first recvmsg() iovec target */
+
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		  unsigned long flag);
+#define conn_flag(c, conn, flag)					\
+	do {								\
+		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
+		conn_flag_do(c, conn, flag);				\
+	} while (0)
+
+/* conn_event(): trace the call site, then call conn_event_do() */
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+		   unsigned long event);
+#define conn_event(c, conn, event)					\
+	do {								\
+		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
+		conn_event_do(c, conn, event);				\
+	} while (0)
+
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+#define tcp_rst(c, conn)						\
+	do {								\
+		flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
+		tcp_rst_do(c, conn);					\
+	} while (0)
+
+/* Helpers called from tcp_buf.c: header fill, window update, flag setup */
+size_t ipv4_fill_headers(const struct ctx *c,
+			 const struct tcp_tap_conn *conn,
+			 struct iphdr *iph, size_t plen,
+			 const uint16_t *check, uint32_t seq);
+size_t ipv6_fill_headers(const struct ctx *c,
+			 const struct tcp_tap_conn *conn,
+			 struct ipv6hdr *ip6h, size_t plen,
+			 uint32_t seq);
+
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+			  int force_seq, struct tcp_info *tinfo);
+int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+		     struct tcphdr *th, char *data, size_t optlen);
+
+#endif /* TCP_INTERNAL_H */
--
2.42.0