Extract buffer management code from tcp.c and move it to tcp_buf.c. tcp.c keeps all the generic code and will also be used by the vhost-user functions. Also compare mode to MODE_PASTA, as we will manage vhost-user mode (MODE_VU) like MODE_PASST. v4: - remove "tcp: extract buffer management from tcp_conn_tap_mss()" as the MSS size can be the same between socket and vhost-user. - rename tcp_send_flag() and tcp_data_from_sock() to tcp_buf_send_flag() and tcp_buf_data_from_sock() v3: - add 3 new patches: "tap: use in->buf_size rather than sizeof(pkt_buf)", "tcp: remove tap_hdr parameter", "iov: remove iov_copy()" v2: - compare to MODE_PASTA in conf_open_files() too - move taph out of udp_update_hdr4()/udp_update_hdr6() Laurent Vivier (10): tcp: inline tcp_l2_buf_fill_headers() tcp: extract buffer management from tcp_send_flag() tcp: move buffers management functions to their own file tap: export pool_flush()/tapX_handler()/packet_add() udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX() udp: rename udp_sock_handler() to udp_buf_sock_handler() vhost-user: compare mode MODE_PASTA and not MODE_PASST iov: remove iov_copy() tcp: remove tap_hdr parameter tap: use in->buf_size rather than sizeof(pkt_buf) Makefile | 5 +- conf.c | 14 +- iov.c | 39 ---- iov.h | 3 - isolation.c | 10 +- passt.c | 4 +- tap.c | 119 +++++----- tap.h | 7 + tcp.c | 604 +++---------------------------------------------- tcp_buf.c | 536 +++++++++++++++++++++++++++++++++++++++++++ tcp_buf.h | 16 ++ tcp_internal.h | 85 +++++++ udp.c | 68 +++--- udp.h | 2 +- 14 files changed, 792 insertions(+), 720 deletions(-) create mode 100644 tcp_buf.c create mode 100644 tcp_buf.h create mode 100644 tcp_internal.h -- 2.44.0
It only calls tcp_fill_headers4() and tcp_fill_headers6() according to the connection IP version. We can inline them in tcp_data_to_tap() that already has a switch on the IP version. In tcp_send_flag(), it will ease to separate code from the common part and the buffer/vhost-user parts. Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tcp.c | 54 +++++++++++++++++++----------------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/tcp.c b/tcp.c index 06acb41e4d90..6f221995f3bc 100644 --- a/tcp.c +++ b/tcp.c @@ -1401,37 +1401,6 @@ static size_t tcp_fill_headers6(const struct ctx *c, return l4len; } -/** - * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers - * @c: Execution context - * @conn: Connection pointer - * @iov: Pointer to an array of iovec of TCP pre-cooked buffers - * @dlen: TCP payload length - * @check: Checksum, if already known - * @seq: Sequence number for this segment - * - * Return: IP payload length, host order - */ -static size_t tcp_l2_buf_fill_headers(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq) -{ - const struct in_addr *a4 = inany_v4(&conn->faddr); - - if (a4) { - return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - check, seq); - } - - return tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - seq); -} - /** * tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap * @c: Execution context @@ -1642,8 +1611,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, - conn->seq_to_tap); + if (CONN_V4(conn)) { + l4len = tcp_fill_headers4(c, 
conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + NULL, conn->seq_to_tap); + } else { + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + conn->seq_to_tap); + } iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (th->ack) { @@ -2146,7 +2124,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp4_seq_update[tcp4_payload_used].len = dlen; iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + check, seq); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); @@ -2155,7 +2136,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp6_seq_update[tcp6_payload_used].len = dlen; iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq); + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + seq); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp6_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); -- 2.44.0
Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tcp.c | 87 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/tcp.c b/tcp.c index 6f221995f3bc..a6f43010f58f 100644 --- a/tcp.c +++ b/tcp.c @@ -1518,24 +1518,25 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, } /** - * tcp_send_flag() - Send segment with flags to tap (no payload) + * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due + * @th: TCP header to update + * @data: buffer to store TCP option + * @optlen: size of the TCP option buffer * - * Return: negative error code on connection reset, 0 otherwise + * Return: < 0 error code on connection reset, + * 0 if there is no flag to send + * 1 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, char *data, + size_t *optlen) { - struct tcp_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; - size_t optlen = 0; - struct tcphdr *th; - struct iovec *iov; - size_t l4len; - char *data; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1557,20 +1558,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) return 0; - if (CONN_V4(conn)) - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - th = &payload->th; - data = payload->opts; - if (flags & SYN) { int mss; /* Options: MSS, NOP and window scale (8 bytes) */ - optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; + *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; *data++ = OPT_MSS; *data++ = OPT_MSS_LEN; 
@@ -1604,26 +1596,13 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) flags |= ACK; } - th->doff = (sizeof(*th) + optlen) / 4; + th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - if (CONN_V4(conn)) { - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - NULL, conn->seq_to_tap); - } else { - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - conn->seq_to_tap); - } - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) conn_flag(c, conn, ~ACK_TO_TAP_DUE); @@ -1638,6 +1617,48 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; + return 1; +} + +static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_flags_t *payload; + size_t optlen = 0; + struct iovec *iov; + size_t l4len; + int ret; + + if (CONN_V4(conn)) { + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + NULL, conn->seq_to_tap); + } else { + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + conn->seq_to_tap); + } + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (flags & DUP_ACK) { 
struct iovec *dup_iov; int i; -- 2.44.0
On Fri, May 31, 2024 at 04:23:36PM +0200, Laurent Vivier wrote: I think this needs a commit message. In particular expanding on what you mean by "buffer management" would be helpful. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tcp.c | 87 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/tcp.c b/tcp.c index 6f221995f3bc..a6f43010f58f 100644 --- a/tcp.c +++ b/tcp.c @@ -1518,24 +1518,25 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, } /** - * tcp_send_flag() - Send segment with flags to tap (no payload) + * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due + * @th: TCP header to update + * @data: buffer to store TCP option + * @optlen: size of the TCP option buffer * - * Return: negative error code on connection reset, 0 otherwise + * Return: < 0 error code on connection reset, + * 0 if there is no flag to send + * 1 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, char *data, + size_t *optlen) { - struct tcp_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; - size_t optlen = 0; - struct tcphdr *th; - struct iovec *iov; - size_t l4len; - char *data; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1557,20 +1558,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) return 0; - if (CONN_V4(conn)) - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - th = &payload->th; - data = payload->opts; - if (flags & SYN) { int 
mss; /* Options: MSS, NOP and window scale (8 bytes) */ - optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; + *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; *data++ = OPT_MSS; *data++ = OPT_MSS_LEN; @@ -1604,26 +1596,13 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) flags |= ACK; } - th->doff = (sizeof(*th) + optlen) / 4; + th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - if (CONN_V4(conn)) { - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - NULL, conn->seq_to_tap); - } else { - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - conn->seq_to_tap); - } - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) conn_flag(c, conn, ~ACK_TO_TAP_DUE); @@ -1638,6 +1617,48 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; + return 1; +} + +static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_flags_t *payload; + size_t optlen = 0; + struct iovec *iov; + size_t l4len; + int ret; + + if (CONN_V4(conn)) { + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; Hrm.. it does occur to me that if you avoided the previous patch, keeping a single dispatcher for filling in the IP headers, then setting the iov pointer would become the only v4/v6 dependent part of this function, helping us down the road of unifying that more. 
Obviously there would be a separate CONN_V4() check hidden inside fill_headers(), but as I've said I'm already aiming to replace those with more specific tests. + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + NULL, conn->seq_to_tap); + } else { + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + conn->seq_to_tap); + } + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (flags & DUP_ACK) { struct iovec *dup_iov; int i; -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 5 +- tcp.c | 575 ++----------------------------------------------- tcp_buf.c | 526 ++++++++++++++++++++++++++++++++++++++++++++ tcp_buf.h | 16 ++ tcp_internal.h | 87 ++++++++ 5 files changed, 652 insertions(+), 557 deletions(-) create mode 100644 tcp_buf.c create mode 100644 tcp_buf.h create mode 100644 tcp_internal.h diff --git a/Makefile b/Makefile index 8ea175762e36..1ac2e5e0053f 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h + siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ + udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/tcp.c b/tcp.c index a6f43010f58f..48d8f7c6d696 100644 --- a/tcp.c +++ b/tcp.c @@ -302,28 +302,14 @@ #include "flow.h" #include "flow_table.h" - -#define TCP_FRAMES_MEM 128 -#define TCP_FRAMES \ - (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) +#include "tcp_internal.h" +#include "tcp_buf.h" #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) -#define MAX_WS 8 -#define MAX_WINDOW (1 << (16 + (MAX_WS))) - /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 -#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ - sizeof(struct tcphdr) - \ - sizeof(struct iphdr), \ - sizeof(uint32_t)) -#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ - sizeof(struct tcphdr) - \ - sizeof(struct ipv6hdr), \ - sizeof(uint32_t)) - #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND # define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd) @@ -345,33 +331,10 @@ */ #define SOL_TCP IPPROTO_TCP -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) - -#define FIN (1 << 0) -#define SYN (1 << 1) -#define RST (1 << 2) -#define ACK (1 << 4) -/* Flags for internal usage */ -#define DUP_ACK (1 << 5) #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ -#define OPT_EOL 0 -#define OPT_NOP 1 -#define OPT_MSS 2 -#define OPT_MSS_LEN 4 -#define OPT_WS 3 -#define OPT_WS_LEN 3 -#define OPT_SACKP 4 -#define OPT_SACK 5 -#define OPT_TS 8 - #define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) -#define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -408,114 +371,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -/** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length - */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; -}; - -/* Static buffers */ -/** - * struct tcp_payload_t - TCP header and data 
to send segments with payload - * @th: TCP header - * @data: TCP data - */ -struct tcp_payload_t { - struct tcphdr th; - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/** - * struct tcp_flags_t - TCP header and data to send zero-length - * segments (flags) - * @th: TCP header - * @opts TCP options - */ -struct tcp_flags_t { - struct tcphdr th; - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/* Ethernet header for IPv4 frames */ -static struct ethhdr tcp4_eth_src; - -static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers */ -static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; -/* TCP segments with payload for IPv4 frames */ -static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; - -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); - -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; -static unsigned int tcp4_payload_used; - -static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers for TCP segment without payload */ -static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; -/* TCP segments without payload for IPv4 frames */ -static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp4_flags_used; - -/* Ethernet header for IPv6 frames */ -static struct ethhdr tcp6_eth_src; - -static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers */ -static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; -/* TCP headers and data for IPv6 frames */ -static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; - -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); - -static struct tcp_buf_seq_update 
tcp6_seq_update[TCP_FRAMES_MEM]; -static unsigned int tcp6_payload_used; - -static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers for TCP segment without payload */ -static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; -/* TCP segment without payload for IPv6 frames */ -static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp6_flags_used; - -/* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; - -/* - * enum tcp_iov_parts - I/O vector parts for one TCP frame - * @TCP_IOV_TAP tap backend specific header - * @TCP_IOV_ETH Ethernet header - * @TCP_IOV_IP IP (v4/v6) header - * @TCP_IOV_PAYLOAD IP payload (TCP header + data) - * @TCP_NUM_IOVS the number of entries in the iovec array - */ -enum tcp_iov_parts { - TCP_IOV_TAP = 0, - TCP_IOV_ETH = 1, - TCP_IOV_IP = 2, - TCP_IOV_PAYLOAD = 3, - TCP_NUM_IOVS -}; - -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +char tcp_buf_discard [MAX_WINDOW]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -560,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLRDHUP; } -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag); -#define conn_flag(c, conn, flag) \ - do { \ - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ - } while (0) - /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events * @c: Execution context @@ -679,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, 
struct tcp_tap_conn *conn, - unsigned long flag) +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag) { if (flag & (flag - 1)) { int flag_index = fls(~flag); @@ -730,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c, * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event) +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event) { int prev, new, num = fls(event); @@ -779,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -#define conn_event(c, conn, event) \ - do { \ - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ - conn_event_do(c, conn, event); \ - } while (0) - /** * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint * @conn: Connection pointer @@ -914,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) th->check = csum(th, l4len, sum); } -/** - * tcp_update_l2_buf() - Update Ethernet header buffers with addresses - * @eth_d: Ethernet destination address, NULL if unchanged - * @eth_s: Ethernet source address, NULL if unchanged - */ -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) -{ - eth_update_mac(&tcp4_eth_src, eth_d, eth_s); - eth_update_mac(&tcp6_eth_src, eth_d, eth_s); -} - -/** - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets - * @c: Execution context - */ -static void tcp_sock4_iov_init(const struct ctx *c) -{ - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); - - for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { - tcp4_payload_ip[i] = iph; - tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { - tcp4_flags_ip[i] = iph; - 
tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_flags[i].th.ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; - } -} - -/** - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets - * @c: Execution context - */ -static void tcp_sock6_iov_init(const struct ctx *c) -{ - struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); - - for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { - tcp6_payload_ip[i] = ip6; - tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { - tcp6_flags_ip[i] = ip6; - tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_flags[i].th .ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; - } -} - /** * tcp_opt_get() - Get option, and value 
if any, from TCP header * @opts: Pointer to start of TCP options in header @@ -1235,50 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn) return true; } -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); -#define tcp_rst(c, conn) \ - do { \ - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ - tcp_rst_do(c, conn); \ - } while (0) - -/** - * tcp_flags_flush() - Send out buffers for segments with no data (flags) - * @c: Execution context - */ -static void tcp_flags_flush(const struct ctx *c) -{ - tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp6_flags_used); - tcp6_flags_used = 0; - - tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp4_flags_used); - tcp4_flags_used = 0; -} - -/** - * tcp_payload_flush() - Send out buffers for segments with data - * @c: Execution context - */ -static void tcp_payload_flush(const struct ctx *c) -{ - unsigned i; - size_t m; - - m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, - tcp6_payload_used); - for (i = 0; i < m; i++) - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; - tcp6_payload_used = 0; - - m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, - tcp4_payload_used); - for (i = 0; i < m; i++) - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; - tcp4_payload_used = 0; -} - /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context @@ -1326,7 +1026,7 @@ static void tcp_fill_header(struct tcphdr *th, * * Return: The IPv4 payload length, host order */ -static size_t tcp_fill_headers4(const struct ctx *c, +size_t tcp_fill_headers4(const struct ctx *c, const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, @@ -1369,11 +1069,11 @@ static size_t tcp_fill_headers4(const struct ctx *c, * * Return: The IPv6 payload length, host order */ -static size_t tcp_fill_headers6(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcphdr *th, - 
size_t dlen, uint32_t seq) +size_t tcp_fill_headers6(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct ipv6hdr *ip6h, struct tcphdr *th, + size_t dlen, uint32_t seq) { size_t l4len = dlen + sizeof(*th); @@ -1410,8 +1110,8 @@ static size_t tcp_fill_headers6(const struct ctx *c, * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; @@ -1530,7 +1230,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * 0 if there is no flag to send * 1 otherwise */ -static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen) { @@ -1620,69 +1320,9 @@ static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, return 1; } -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { - struct tcp_flags_t *payload; - size_t optlen = 0; - struct iovec *iov; - size_t l4len; - int ret; - - if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - - ret = tcp_fill_flag_header(c, conn, flags, &payload->th, - payload->opts, &optlen); - if (ret <= 0) - return ret; - - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - NULL, conn->seq_to_tap); - } else { - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - - ret = tcp_fill_flag_header(c, conn, flags, &payload->th, - 
payload->opts, &optlen); - if (ret <= 0) - return ret; - - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - conn->seq_to_tap); - } - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - - if (flags & DUP_ACK) { - struct iovec *dup_iov; - int i; - - if (CONN_V4(conn)) - dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - for (i = 0; i < TCP_NUM_IOVS; i++) - memcpy(dup_iov[i].iov_base, iov[i].iov_base, - iov[i].iov_len); - dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; - } - - if (CONN_V4(conn)) { - if (tcp4_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } else { - if (tcp6_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } - - return 0; + return tcp_buf_send_flag(c, conn, flags); } /** @@ -1690,7 +1330,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -2117,184 +1757,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) return 0; } -/** - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer - * @c: Execution context - * @conn: Connection pointer - * @dlen: TCP payload length - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer - * @seq: Sequence number to be sent - */ -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t dlen, int no_csum, uint32_t seq) -{ - uint32_t *seq_update = &conn->seq_to_tap; - struct iovec *iov; - size_t l4len; - - if (CONN_V4(conn)) { - struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; - const uint16_t *check = NULL; - - if (no_csum) { - struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; - check = &iph->check; 
- } - - tcp4_seq_update[tcp4_payload_used].seq = seq_update; - tcp4_seq_update[tcp4_payload_used].len = dlen; - - iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - check, seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp4_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); - } else if (CONN_V6(conn)) { - tcp6_seq_update[tcp6_payload_used].seq = seq_update; - tcp6_seq_update[tcp6_payload_used].len = dlen; - - iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp6_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); - } -} - -/** - * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window - * @c: Execution context - * @conn: Connection pointer - * - * Return: negative on connection reset, 0 otherwise - * - * #syscalls recvmsg - */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int sendlen, len, dlen, v4 = CONN_V4(conn); - int s = conn->sock, i, ret = 0; - struct msghdr mh_sock = { 0 }; - uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; - struct iovec *iov; - - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - - if (SEQ_LT(already_sent, 0)) { - /* RFC 761, section 2.1. */ - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", - conn->seq_ack_from_tap, conn->seq_to_tap); - conn->seq_to_tap = conn->seq_ack_from_tap; - already_sent = 0; - } - - if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); - return 0; - } - - /* Set up buffer descriptors we'll fill completely and partially. 
*/ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); - if (fill_bufs > TCP_FRAMES) { - fill_bufs = TCP_FRAMES; - iov_rem = 0; - } else { - iov_rem = (wnd_scaled - already_sent) % mss; - } - - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - - if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || - (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { - tcp_payload_flush(c); - - /* Silence Coverity CWE-125 false positive */ - tcp4_payload_used = tcp6_payload_used = 0; - } - - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { - if (v4) - iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; - else - iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; - iov->iov_len = mss; - } - if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; - - /* Receive into buffers, don't dequeue until acknowledged by guest. */ - do - len = recvmsg(s, &mh_sock, MSG_PEEK); - while (len < 0 && errno == EINTR); - - if (len < 0) - goto err; - - if (!len) { - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_send_flag(c, conn, FIN | ACK))) { - tcp_rst(c, conn); - return ret; - } - - conn_event(c, conn, TAP_FIN_SENT); - } - - return 0; - } - - sendlen = len - already_sent; - if (sendlen <= 0) { - conn_flag(c, conn, STALLED); - return 0; - } - - conn_flag(c, conn, ~STALLED); - - send_bufs = DIV_ROUND_UP(sendlen, mss); - last_len = sendlen - (send_bufs - 1) * mss; - - /* Likely, some new data was acked too. 
*/ - tcp_update_seqack_wnd(c, conn, 0, NULL); - - /* Finally, queue to tap */ - dlen = mss; - seq = conn->seq_to_tap; - for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; - - if (i == send_bufs - 1) - dlen = last_len; - - tcp_data_to_tap(c, conn, dlen, no_csum, seq); - seq += dlen; - } - - conn_flag(c, conn, ACK_FROM_TAP_DUE); - - return 0; - -err: - if (errno != EAGAIN && errno != EWOULDBLOCK) { - ret = -errno; - tcp_rst(c, conn); - } - - return ret; + return tcp_buf_data_from_sock(c, conn); } /** diff --git a/tcp_buf.c b/tcp_buf.c new file mode 100644 index 000000000000..87923029a958 --- /dev/null +++ b/tcp_buf.c @@ -0,0 +1,526 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tcp_buf.c - TCP L2-L4 translation state machine + * + * Copyright (c) 2020-2022 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <errno.h> + +#include <netinet/ip.h> + +#include <linux/tcp.h> + +#include "util.h" +#include "ip.h" +#include "iov.h" +#include "passt.h" +#include "tap.h" +#include "siphash.h" +#include "inany.h" +#include "tcp_conn.h" +#include "tcp_internal.h" +#include "tcp_buf.h" + +#define TCP_FRAMES_MEM 128 +#define TCP_FRAMES \ + (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) + +/** + * tcp_buf_seq_update - Sequences to update with length of frames once sent + * @seq: Pointer to sequence number sent to tap-side, to be updated + * @len: TCP payload length + */ +struct tcp_buf_seq_update { + uint32_t *seq; + uint16_t len; +}; + +/* Static buffers */ +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; + +static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; +/* IPv4 headers */ +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +/* TCP segments with payload for IPv4 frames */ +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); + +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_payload_used; + +static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP segment without payload */ +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; +/* TCP segments without payload for IPv4 frames */ +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp4_flags_used; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; + +static struct tap_hdr 
tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; +/* IPv6 headers */ +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; + +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); + +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp6_payload_used; + +static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; +/* IPv6 headers for TCP segment without payload */ +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; +/* TCP segment without payload for IPv6 frames */ +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_flags_used; + +/* recvmsg()/sendmsg() data for tap */ +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; + +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_TAP tap backend specific header + * @TCP_IOV_ETH Ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_TAP = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; + +/** + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses + * @eth_d: Ethernet destination address, NULL if unchanged + * @eth_s: Ethernet source address, NULL if unchanged + */ +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) +{ + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); +} + +/** + * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * @c: Execution context 
+ */ +void tcp_sock4_iov_init(const struct ctx *c) +{ + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + + for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { + tcp4_payload_ip[i] = iph; + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_payload[i].th.ack = 1; + } + + for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { + tcp4_flags_ip[i] = iph; + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_flags[i].th.ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_flags_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; + } +} + +/** + * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets + * @c: Execution context + */ +void tcp_sock6_iov_init(const struct ctx *c) +{ + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + + for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { + tcp6_payload_ip[i] = ip6; + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_payload[i].th.ack = 1; + } + + for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { + tcp6_flags_ip[i] = ip6; + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_flags[i].th .ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); + iov[TCP_IOV_IP] = 
IOV_OF_LVALUE(tcp6_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_flags_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + } +} + +/** + * tcp_flags_flush() - Send out buffers for segments with no data (flags) + * @c: Execution context + */ +void tcp_flags_flush(const struct ctx *c) +{ + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp6_flags_used); + tcp6_flags_used = 0; + + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp4_flags_used); + tcp4_flags_used = 0; +} + +/** + * tcp_payload_flush() - Send out buffers for segments with data + * @c: Execution context + */ +void tcp_payload_flush(const struct ctx *c) +{ + unsigned i; + size_t m; + + m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, + tcp6_payload_used); + for (i = 0; i < m; i++) + *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + tcp6_payload_used = 0; + + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, + tcp4_payload_used); + for (i = 0; i < m; i++) + *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + tcp4_payload_used = 0; +} + +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_flags_t *payload; + size_t optlen = 0; + struct iovec *iov; + size_t l4len; + int ret; + + if (CONN_V4(conn)) { + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + NULL, conn->seq_to_tap); + } else { + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + 
ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + conn->seq_to_tap); + } + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (flags & DUP_ACK) { + struct iovec *dup_iov; + int i; + + if (CONN_V4(conn)) + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + for (i = 0; i < TCP_NUM_IOVS; i++) + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + } + + if (CONN_V4(conn)) { + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } else { + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } + + return 0; +} + +/** + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer + * @c: Execution context + * @conn: Connection pointer + * @dlen: TCP payload length + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer + * @seq: Sequence number to be sent + */ +void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, + ssize_t dlen, int no_csum, uint32_t seq) +{ + uint32_t *seq_update = &conn->seq_to_tap; + struct iovec *iov; + size_t l4len; + + if (CONN_V4(conn)) { + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; + const uint16_t *check = NULL; + + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } + + tcp4_seq_update[tcp4_payload_used].seq = seq_update; + tcp4_seq_update[tcp4_payload_used].len = dlen; + + iov = tcp4_l2_iov[tcp4_payload_used++]; + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + check, seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); + } else if 
(CONN_V6(conn)) { + tcp6_seq_update[tcp6_payload_used].seq = seq_update; + tcp6_seq_update[tcp6_payload_used].len = dlen; + + iov = tcp6_l2_iov[tcp6_payload_used++]; + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); + } +} + +/** + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: negative on connection reset, 0 otherwise + * + * #syscalls recvmsg + */ +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; + int sendlen, len, dlen, v4 = CONN_V4(conn); + int s = conn->sock, i, ret = 0; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + uint32_t already_sent, seq; + struct iovec *iov; + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. 
*/ + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + if (fill_bufs > TCP_FRAMES) { + fill_bufs = TCP_FRAMES; + iov_rem = 0; + } else { + iov_rem = (wnd_scaled - already_sent) % mss; + } + + mh_sock.msg_iov = iov_sock; + mh_sock.msg_iovlen = fill_bufs + 1; + + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + tcp_payload_flush(c); + + /* Silence Coverity CWE-125 false positive */ + tcp4_payload_used = tcp6_payload_used = 0; + } + + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + if (v4) + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; + else + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; + iov->iov_len = mss; + } + if (iov_rem) + iov_sock[fill_bufs].iov_len = iov_rem; + + /* Receive into buffers, don't dequeue until acknowledged by guest. */ + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + sendlen = len - already_sent; + if (sendlen <= 0) { + conn_flag(c, conn, STALLED); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + send_bufs = DIV_ROUND_UP(sendlen, mss); + last_len = sendlen - (send_bufs - 1) * mss; + + /* Likely, some new data was acked too. 
*/ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* Finally, queue to tap */ + dlen = mss; + seq = conn->seq_to_tap; + for (i = 0; i < send_bufs; i++) { + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; + + if (i == send_bufs - 1) + dlen = last_len; + + tcp_data_to_tap(c, conn, dlen, no_csum, seq); + seq += dlen; + } + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; + +err: + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_buf.h b/tcp_buf.h new file mode 100644 index 000000000000..14be7b945285 --- /dev/null +++ b/tcp_buf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_BUF_H +#define TCP_BUF_H + +void tcp_sock4_iov_init(const struct ctx *c); +void tcp_sock6_iov_init(const struct ctx *c); +void tcp_flags_flush(const struct ctx *c); +void tcp_payload_flush(const struct ctx *c); +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); + +#endif /*TCP_BUF_H */ diff --git a/tcp_internal.h b/tcp_internal.h new file mode 100644 index 000000000000..e47b64a68afd --- /dev/null +++ b/tcp_internal.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_INTERNAL_H +#define TCP_INTERNAL_H + +#define MAX_WS 8 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) + +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct iphdr), \ + sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct ipv6hdr), \ + sizeof(uint32_t)) + +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) +#define SEQ_GT(a, b) ((a) - (b) - 1 < 
MAX_WINDOW) + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +/* Flags for internal usage */ +#define DUP_ACK (1 << 5) +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_MSS_LEN 4 +#define OPT_WS 3 +#define OPT_WS_LEN 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +extern char tcp_buf_discard [MAX_WINDOW]; + +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag); +#define conn_flag(c, conn, flag) \ + do { \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + + +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event); +#define conn_event(c, conn, event) \ + do { \ + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); +#define tcp_rst(c, conn) \ + do { \ + flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ + tcp_rst_do(c, conn); \ + } while (0) + +size_t tcp_fill_headers4(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *iph, struct tcphdr *th, + size_t dlen, const uint16_t *check, + uint32_t seq); +size_t tcp_fill_headers6(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct ipv6hdr *ip6h, struct tcphdr *th, + size_t dlen, uint32_t seq); +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo); +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags, + struct tcphdr *th, char *data, size_t *optlen); + +#endif /* TCP_INTERNAL_H */ -- 2.44.0
On Fri, May 31, 2024 at 04:23:37PM +0200, Laurent Vivier wrote: Commit message. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 5 +- tcp.c | 575 ++----------------------------------------------- tcp_buf.c | 526 ++++++++++++++++++++++++++++++++++++++++++++ tcp_buf.h | 16 ++ tcp_internal.h | 87 ++++++++ 5 files changed, 652 insertions(+), 557 deletions(-) create mode 100644 tcp_buf.c create mode 100644 tcp_buf.h create mode 100644 tcp_internal.h diff --git a/Makefile b/Makefile index 8ea175762e36..1ac2e5e0053f 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h + siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ + udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/tcp.c b/tcp.c index a6f43010f58f..48d8f7c6d696 100644 --- a/tcp.c +++ b/tcp.c @@ -302,28 +302,14 @@ #include "flow.h" #include "flow_table.h" - -#define TCP_FRAMES_MEM 128 -#define TCP_FRAMES \ - (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) +#include "tcp_internal.h" +#include "tcp_buf.h" #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) -#define MAX_WS 8 -#define MAX_WINDOW (1 << (16 + (MAX_WS))) - /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 -#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ - sizeof(struct tcphdr) - \ - sizeof(struct iphdr), \ - sizeof(uint32_t)) -#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ - sizeof(struct tcphdr) - \ - sizeof(struct ipv6hdr), \ - sizeof(uint32_t)) - #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND # define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd) @@ -345,33 +331,10 @@ */ #define SOL_TCP IPPROTO_TCP -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) - -#define FIN (1 << 0) -#define SYN (1 << 1) -#define RST (1 << 2) -#define ACK (1 << 4) -/* Flags for internal usage */ -#define DUP_ACK (1 << 5) #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ -#define OPT_EOL 0 -#define OPT_NOP 1 -#define OPT_MSS 2 -#define OPT_MSS_LEN 4 -#define OPT_WS 3 -#define OPT_WS_LEN 3 -#define OPT_SACKP 4 -#define OPT_SACK 5 -#define OPT_TS 8 - #define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) -#define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -408,114 +371,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -/** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length - */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; -};This will conflict with Jon's upcoming changes, and I think it will be simpler 
if his go first (although they have taken rather longer to land than I was expecting).-/* Static buffers */ -/** - * struct tcp_payload_t - TCP header and data to send segments with payload - * @th: TCP header - * @data: TCP data - */ -struct tcp_payload_t { - struct tcphdr th; - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/** - * struct tcp_flags_t - TCP header and data to send zero-length - * segments (flags) - * @th: TCP header - * @opts TCP options - */ -struct tcp_flags_t { - struct tcphdr th; - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/* Ethernet header for IPv4 frames */ -static struct ethhdr tcp4_eth_src; - -static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers */ -static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; -/* TCP segments with payload for IPv4 frames */ -static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; - -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); - -static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; -static unsigned int tcp4_payload_used; - -static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers for TCP segment without payload */ -static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; -/* TCP segments without payload for IPv4 frames */ -static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp4_flags_used; - -/* Ethernet header for IPv6 frames */ -static struct ethhdr tcp6_eth_src; - -static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers */ -static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; -/* TCP headers and data for IPv6 frames */ -static struct tcp_payload_t 
tcp6_payload[TCP_FRAMES_MEM]; - -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); - -static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; -static unsigned int tcp6_payload_used; - -static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers for TCP segment without payload */ -static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; -/* TCP segment without payload for IPv6 frames */ -static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp6_flags_used; - -/* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; - -/* - * enum tcp_iov_parts - I/O vector parts for one TCP frame - * @TCP_IOV_TAP tap backend specific header - * @TCP_IOV_ETH Ethernet header - * @TCP_IOV_IP IP (v4/v6) header - * @TCP_IOV_PAYLOAD IP payload (TCP header + data) - * @TCP_NUM_IOVS the number of entries in the iovec array - */ -enum tcp_iov_parts { - TCP_IOV_TAP = 0, - TCP_IOV_ETH = 1, - TCP_IOV_IP = 2, - TCP_IOV_PAYLOAD = 3, - TCP_NUM_IOVS -}; - -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +char tcp_buf_discard [MAX_WINDOW]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -560,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLRDHUP; } -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag); -#define conn_flag(c, conn, flag) \ - do { \ - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ - } while (0) - /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events * @c: Execution context @@ -679,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx 
*c, struct tcp_tap_conn *conn) * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag) +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag) { if (flag & (flag - 1)) { int flag_index = fls(~flag); @@ -730,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c, * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event) +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event) { int prev, new, num = fls(event); @@ -779,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -#define conn_event(c, conn, event) \ - do { \ - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ - conn_event_do(c, conn, event); \ - } while (0) - /** * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint * @conn: Connection pointer @@ -914,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) th->check = csum(th, l4len, sum); } -/** - * tcp_update_l2_buf() - Update Ethernet header buffers with addresses - * @eth_d: Ethernet destination address, NULL if unchanged - * @eth_s: Ethernet source address, NULL if unchanged - */ -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) -{ - eth_update_mac(&tcp4_eth_src, eth_d, eth_s); - eth_update_mac(&tcp6_eth_src, eth_d, eth_s); -} - -/** - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets - * @c: Execution context - */ -static void tcp_sock4_iov_init(const struct ctx *c) -{ - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); - - for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { - tcp4_payload_ip[i] = iph; - 
tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { - tcp4_flags_ip[i] = iph; - tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_flags[i].th.ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; - } -} - -/** - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets - * @c: Execution context - */ -static void tcp_sock6_iov_init(const struct ctx *c) -{ - struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); - - for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { - tcp6_payload_ip[i] = ip6; - tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { - tcp6_flags_ip[i] = ip6; - tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_flags[i].th .ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); - iov[TCP_IOV_ETH] = 
IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; - } -} - /** * tcp_opt_get() - Get option, and value if any, from TCP header * @opts: Pointer to start of TCP options in header @@ -1235,50 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn) return true; } -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); -#define tcp_rst(c, conn) \ - do { \ - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ - tcp_rst_do(c, conn); \ - } while (0) - -/** - * tcp_flags_flush() - Send out buffers for segments with no data (flags) - * @c: Execution context - */ -static void tcp_flags_flush(const struct ctx *c) -{ - tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp6_flags_used); - tcp6_flags_used = 0; - - tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp4_flags_used); - tcp4_flags_used = 0; -} - -/** - * tcp_payload_flush() - Send out buffers for segments with data - * @c: Execution context - */ -static void tcp_payload_flush(const struct ctx *c) -{ - unsigned i; - size_t m; - - m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, - tcp6_payload_used); - for (i = 0; i < m; i++) - *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; - tcp6_payload_used = 0; - - m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, - tcp4_payload_used); - for (i = 0; i < m; i++) - *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; - tcp4_payload_used = 0; -} - /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context @@ -1326,7 +1026,7 @@ static void tcp_fill_header(struct tcphdr *th, * * Return: The IPv4 payload length, host order */ -static size_t tcp_fill_headers4(const struct ctx *c, +size_t tcp_fill_headers4(const struct ctx *c, const struct tcp_tap_conn *conn, struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, @@ -1369,11 +1069,11 @@ static size_t tcp_fill_headers4(const struct ctx *c, * * Return: The IPv6 
payload length, host order */ -static size_t tcp_fill_headers6(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcphdr *th, - size_t dlen, uint32_t seq) +size_t tcp_fill_headers6(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct ipv6hdr *ip6h, struct tcphdr *th, + size_t dlen, uint32_t seq) { size_t l4len = dlen + sizeof(*th); @@ -1410,8 +1110,8 @@ static size_t tcp_fill_headers6(const struct ctx *c, * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; @@ -1530,7 +1230,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * 0 if there is no flag to send * 1 otherwise */ -static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen) { @@ -1620,69 +1320,9 @@ static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, return 1; } -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { - struct tcp_flags_t *payload; - size_t optlen = 0; - struct iovec *iov; - size_t l4len; - int ret; - - if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - - ret = tcp_fill_flag_header(c, conn, flags, &payload->th, - payload->opts, &optlen); - if (ret <= 0) - return ret; - - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - NULL, 
conn->seq_to_tap); - } else { - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - - ret = tcp_fill_flag_header(c, conn, flags, &payload->th, - payload->opts, &optlen); - if (ret <= 0) - return ret; - - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, optlen, - conn->seq_to_tap); - } - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - - if (flags & DUP_ACK) { - struct iovec *dup_iov; - int i; - - if (CONN_V4(conn)) - dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - for (i = 0; i < TCP_NUM_IOVS; i++) - memcpy(dup_iov[i].iov_base, iov[i].iov_base, - iov[i].iov_len); - dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; - } - - if (CONN_V4(conn)) { - if (tcp4_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } else { - if (tcp6_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } - - return 0; + return tcp_buf_send_flag(c, conn, flags); } /** @@ -1690,7 +1330,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -2117,184 +1757,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) return 0; } -/** - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer - * @c: Execution context - * @conn: Connection pointer - * @dlen: TCP payload length - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer - * @seq: Sequence number to be sent - */ -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t dlen, int no_csum, uint32_t seq) -{ - uint32_t *seq_update = &conn->seq_to_tap; - struct iovec *iov; - size_t l4len; - - if (CONN_V4(conn)) { - 
struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; - const uint16_t *check = NULL; - - if (no_csum) { - struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; - check = &iph->check; - } - - tcp4_seq_update[tcp4_payload_used].seq = seq_update; - tcp4_seq_update[tcp4_payload_used].len = dlen; - - iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - check, seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp4_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); - } else if (CONN_V6(conn)) { - tcp6_seq_update[tcp6_payload_used].seq = seq_update; - tcp6_seq_update[tcp6_payload_used].len = dlen; - - iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, - iov[TCP_IOV_IP].iov_base, - iov[TCP_IOV_PAYLOAD].iov_base, dlen, - seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp6_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); - } -} - -/** - * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window - * @c: Execution context - * @conn: Connection pointer - * - * Return: negative on connection reset, 0 otherwise - * - * #syscalls recvmsg - */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int sendlen, len, dlen, v4 = CONN_V4(conn); - int s = conn->sock, i, ret = 0; - struct msghdr mh_sock = { 0 }; - uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; - struct iovec *iov; - - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - - if (SEQ_LT(already_sent, 0)) { - /* RFC 761, section 2.1. 
*/ - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", - conn->seq_ack_from_tap, conn->seq_to_tap); - conn->seq_to_tap = conn->seq_ack_from_tap; - already_sent = 0; - } - - if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); - return 0; - } - - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); - if (fill_bufs > TCP_FRAMES) { - fill_bufs = TCP_FRAMES; - iov_rem = 0; - } else { - iov_rem = (wnd_scaled - already_sent) % mss; - } - - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - - if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || - (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { - tcp_payload_flush(c); - - /* Silence Coverity CWE-125 false positive */ - tcp4_payload_used = tcp6_payload_used = 0; - } - - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { - if (v4) - iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; - else - iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; - iov->iov_len = mss; - } - if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; - - /* Receive into buffers, don't dequeue until acknowledged by guest. */ - do - len = recvmsg(s, &mh_sock, MSG_PEEK); - while (len < 0 && errno == EINTR); - - if (len < 0) - goto err; - - if (!len) { - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_send_flag(c, conn, FIN | ACK))) { - tcp_rst(c, conn); - return ret; - } - - conn_event(c, conn, TAP_FIN_SENT); - } - - return 0; - } - - sendlen = len - already_sent; - if (sendlen <= 0) { - conn_flag(c, conn, STALLED); - return 0; - } - - conn_flag(c, conn, ~STALLED); - - send_bufs = DIV_ROUND_UP(sendlen, mss); - last_len = sendlen - (send_bufs - 1) * mss; - - /* Likely, some new data was acked too. 
*/ - tcp_update_seqack_wnd(c, conn, 0, NULL); - - /* Finally, queue to tap */ - dlen = mss; - seq = conn->seq_to_tap; - for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; - - if (i == send_bufs - 1) - dlen = last_len; - - tcp_data_to_tap(c, conn, dlen, no_csum, seq); - seq += dlen; - } - - conn_flag(c, conn, ACK_FROM_TAP_DUE); - - return 0; - -err: - if (errno != EAGAIN && errno != EWOULDBLOCK) { - ret = -errno; - tcp_rst(c, conn); - } - - return ret; + return tcp_buf_data_from_sock(c, conn); } /** diff --git a/tcp_buf.c b/tcp_buf.c new file mode 100644 index 000000000000..87923029a958 --- /dev/null +++ b/tcp_buf.c @@ -0,0 +1,526 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tcp_buf.c - TCP L2-L4 translation state machine
This description doesn't appear correct, or at least not complete, for the new file.
+ * + * Copyright (c) 2020-2022 Red Hat GmbH
And this should probably be updated since you're touching it too. Maybe go with the plain "Copyright Red Hat" that Red Hat legal seems to recommend.
+ * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <errno.h> + +#include <netinet/ip.h> + +#include <linux/tcp.h> + +#include "util.h" +#include "ip.h" +#include "iov.h" +#include "passt.h" +#include "tap.h" +#include "siphash.h" +#include "inany.h" +#include "tcp_conn.h" +#include "tcp_internal.h" +#include "tcp_buf.h" + +#define TCP_FRAMES_MEM 128 +#define TCP_FRAMES \ + (c->mode == MODE_PASST ?
TCP_FRAMES_MEM : 1) + +/** + * tcp_buf_seq_update - Sequences to update with length of frames once sent + * @seq: Pointer to sequence number sent to tap-side, to be updated + * @len: TCP payload length + */ +struct tcp_buf_seq_update { + uint32_t *seq; + uint16_t len; +}; + +/* Static buffers */ +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; + +static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; +/* IPv4 headers */ +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +/* TCP segments with payload for IPv4 frames */ +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); + +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_payload_used; + +static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP segment without payload */ +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; +/* TCP segments without payload for IPv4 frames */ +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp4_flags_used; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; + +static struct tap_hdr 
tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; +/* IPv6 headers */ +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; + +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); + +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp6_payload_used; + +static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; +/* IPv6 headers for TCP segment without payload */ +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; +/* TCP segment without payload for IPv6 frames */ +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_flags_used; + +/* recvmsg()/sendmsg() data for tap */ +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; + +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_TAP tap backend specific header + * @TCP_IOV_ETH Ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_TAP = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; + +/** + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses + * @eth_d: Ethernet destination address, NULL if unchanged + * @eth_s: Ethernet source address, NULL if unchanged + */ +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) +{ + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); +} + +/** + * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * @c: Execution context 
+ */ +void tcp_sock4_iov_init(const struct ctx *c) +{ + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + + for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { + tcp4_payload_ip[i] = iph; + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_payload[i].th.ack = 1; + } + + for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { + tcp4_flags_ip[i] = iph; + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_flags[i].th.ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_flags_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; + } +} + +/** + * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets + * @c: Execution context + */ +void tcp_sock6_iov_init(const struct ctx *c) +{ + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + + for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { + tcp6_payload_ip[i] = ip6; + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_payload[i].th.ack = 1; + } + + for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { + tcp6_flags_ip[i] = ip6; + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_flags[i].th .ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); + iov[TCP_IOV_IP] = 
IOV_OF_LVALUE(tcp6_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_flags_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + } +} + +/** + * tcp_flags_flush() - Send out buffers for segments with no data (flags) + * @c: Execution context + */ +void tcp_flags_flush(const struct ctx *c) +{ + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp6_flags_used); + tcp6_flags_used = 0; + + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp4_flags_used); + tcp4_flags_used = 0; +} + +/** + * tcp_payload_flush() - Send out buffers for segments with data + * @c: Execution context + */ +void tcp_payload_flush(const struct ctx *c) +{ + unsigned i; + size_t m; + + m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, + tcp6_payload_used); + for (i = 0; i < m; i++) + *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + tcp6_payload_used = 0; + + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, + tcp4_payload_used); + for (i = 0; i < m; i++) + *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + tcp4_payload_used = 0; +} + +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_flags_t *payload; + size_t optlen = 0; + struct iovec *iov; + size_t l4len; + int ret; + + if (CONN_V4(conn)) { + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + NULL, conn->seq_to_tap); + } else { + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + 
ret = tcp_fill_flag_header(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) + return ret; + + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, optlen, + conn->seq_to_tap); + } + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (flags & DUP_ACK) { + struct iovec *dup_iov; + int i; + + if (CONN_V4(conn)) + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + for (i = 0; i < TCP_NUM_IOVS; i++) + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + } + + if (CONN_V4(conn)) { + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } else { + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } + + return 0; +} + +/** + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer + * @c: Execution context + * @conn: Connection pointer + * @dlen: TCP payload length + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer + * @seq: Sequence number to be sent + */ +void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, + ssize_t dlen, int no_csum, uint32_t seq) +{ + uint32_t *seq_update = &conn->seq_to_tap; + struct iovec *iov; + size_t l4len; + + if (CONN_V4(conn)) { + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; + const uint16_t *check = NULL; + + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } + + tcp4_seq_update[tcp4_payload_used].seq = seq_update; + tcp4_seq_update[tcp4_payload_used].len = dlen; + + iov = tcp4_l2_iov[tcp4_payload_used++]; + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + check, seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); + } else if 
(CONN_V6(conn)) { + tcp6_seq_update[tcp6_payload_used].seq = seq_update; + tcp6_seq_update[tcp6_payload_used].len = dlen; + + iov = tcp6_l2_iov[tcp6_payload_used++]; + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, dlen, + seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); + } +} + +/** + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: negative on connection reset, 0 otherwise + * + * #syscalls recvmsg + */ +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; + int sendlen, len, dlen, v4 = CONN_V4(conn); + int s = conn->sock, i, ret = 0; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + uint32_t already_sent, seq; + struct iovec *iov; + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. 
*/ + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + if (fill_bufs > TCP_FRAMES) { + fill_bufs = TCP_FRAMES; + iov_rem = 0; + } else { + iov_rem = (wnd_scaled - already_sent) % mss; + } + + mh_sock.msg_iov = iov_sock; + mh_sock.msg_iovlen = fill_bufs + 1; + + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + tcp_payload_flush(c); + + /* Silence Coverity CWE-125 false positive */ + tcp4_payload_used = tcp6_payload_used = 0; + } + + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + if (v4) + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; + else + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; + iov->iov_len = mss; + } + if (iov_rem) + iov_sock[fill_bufs].iov_len = iov_rem; + + /* Receive into buffers, don't dequeue until acknowledged by guest. */ + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + sendlen = len - already_sent; + if (sendlen <= 0) { + conn_flag(c, conn, STALLED); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + send_bufs = DIV_ROUND_UP(sendlen, mss); + last_len = sendlen - (send_bufs - 1) * mss; + + /* Likely, some new data was acked too. 
*/ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* Finally, queue to tap */ + dlen = mss; + seq = conn->seq_to_tap; + for (i = 0; i < send_bufs; i++) { + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; + + if (i == send_bufs - 1) + dlen = last_len; + + tcp_data_to_tap(c, conn, dlen, no_csum, seq); + seq += dlen; + } + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; + +err: + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_buf.h b/tcp_buf.h new file mode 100644 index 000000000000..14be7b945285 --- /dev/null +++ b/tcp_buf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_BUF_H +#define TCP_BUF_H + +void tcp_sock4_iov_init(const struct ctx *c); +void tcp_sock6_iov_init(const struct ctx *c); +void tcp_flags_flush(const struct ctx *c); +void tcp_payload_flush(const struct ctx *c); +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); + +#endif /*TCP_BUF_H */ diff --git a/tcp_internal.h b/tcp_internal.h new file mode 100644 index 000000000000..e47b64a68afd --- /dev/null +++ b/tcp_internal.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_INTERNAL_H +#define TCP_INTERNAL_H + +#define MAX_WS 8 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) + +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct iphdr), \ + sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct ipv6hdr), \ + sizeof(uint32_t)) + +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) +#define SEQ_GT(a, b) ((a) - (b) - 1 < 
MAX_WINDOW) + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +/* Flags for internal usage */ +#define DUP_ACK (1 << 5) +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_MSS_LEN 4 +#define OPT_WS 3 +#define OPT_WS_LEN 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +extern char tcp_buf_discard [MAX_WINDOW]; + +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag); +#define conn_flag(c, conn, flag) \ + do { \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + + +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event); +#define conn_event(c, conn, event) \ + do { \ + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); +#define tcp_rst(c, conn) \ + do { \ + flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ + tcp_rst_do(c, conn); \ + } while (0) + +size_t tcp_fill_headers4(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct iphdr *iph, struct tcphdr *th, + size_t dlen, const uint16_t *check, + uint32_t seq); +size_t tcp_fill_headers6(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct tap_hdr *taph, + struct ipv6hdr *ip6h, struct tcphdr *th, + size_t dlen, uint32_t seq); +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo); +int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags, + struct tcphdr *th, char *data, size_t *optlen); + +#endif /* TCP_INTERNAL_H */
--
David Gibson                   | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
                               | _way_ _around_!
http://www.ozlabs.org/~dgibson
Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 97 +++++++++++++++++++++++++++++------------------------------ tap.h | 7 +++++ 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/tap.c b/tap.c index 2ea08491a51f..ab8d760efb11 100644 --- a/tap.c +++ b/tap.c @@ -920,6 +920,45 @@ append: return in->count; } +void pool_flush_all(void) +{ + pool_flush(pool_tap4); + pool_flush(pool_tap6); +} + +void tap_handler_all(struct ctx *c, const struct timespec *now) +{ + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); +} + +void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p, + const char *func, int line) +{ + const struct ethhdr *eh; + + pcap(p, l2len); + + eh = (struct ethhdr *)p; + + if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { + memcpy(c->mac_guest, eh->h_source, ETH_ALEN); + proto_update_l2_buf(c->mac_guest, NULL); + } + + switch (ntohs(eh->h_proto)) { + case ETH_P_ARP: + case ETH_P_IP: + packet_add_do(pool_tap4, l2len, p, func, line); + break; + case ETH_P_IPV6: + packet_add_do(pool_tap6, l2len, p, func, line); + break; + default: + break; + } +} + /** * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context @@ -946,7 +985,6 @@ static void tap_sock_reset(struct ctx *c) void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now) { - const struct ethhdr *eh; ssize_t n, rem; char *p; @@ -959,8 +997,7 @@ redo: p = pkt_buf; rem = 0; - pool_flush(pool_tap4); - pool_flush(pool_tap6); + pool_flush_all(); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); if (n < 0) { @@ -987,38 +1024,18 @@ redo: /* Complete the partial read above before discarding a malformed * frame, otherwise the stream will be inconsistent. 
*/ - if (l2len < (ssize_t)sizeof(*eh) || + if (l2len < (ssize_t)sizeof(struct ethhdr) || l2len > (ssize_t)ETH_MAX_MTU) goto next; - pcap(p, l2len); - - eh = (struct ethhdr *)p; - - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, l2len, p); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, l2len, p); - break; - default: - break; - } + packet_add_all(c, l2len, p); next: p += l2len; n -= l2len; } - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler_all(c, now); /* We can't use EPOLLET otherwise. */ if (rem) @@ -1043,35 +1060,18 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, redo: n = 0; - pool_flush(pool_tap4); - pool_flush(pool_tap6); + pool_flush_all(); restart: while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { - const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); - if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) { + if (len < (ssize_t)sizeof(struct ethhdr) || + len > (ssize_t)ETH_MAX_MTU) { n += len; continue; } - pcap(pkt_buf + n, len); - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, len, pkt_buf + n); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, len, pkt_buf + n); - break; - default: - break; - } + packet_add_all(c, len, pkt_buf + n); if ((n += len) == TAP_BUF_BYTES) break; @@ -1082,8 +1082,7 @@ restart: ret = errno; - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler_all(c, now); if (len > 0 || ret == EAGAIN) return; diff --git a/tap.h b/tap.h index 2285a87093f9..3ffb7d6c3a91 100644 --- a/tap.h +++ b/tap.h @@ -70,5 +70,12 @@ void tap_handler_passt(struct ctx *c, 
uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); void tap_sock_init(struct ctx *c); +void pool_flush_all(void); +void tap_handler_all(struct ctx *c, const struct timespec *now); + +void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p, + const char *func, int line); +#define packet_add_all(p, l2len, start) \ + packet_add_all_do(p, l2len, start, __func__, __LINE__) #endif /* TAP_H */ -- 2.44.0
On Fri, May 31, 2024 at 04:23:38PM +0200, Laurent Vivier wrote:
Commit message, in particular mentioning why you're going to need this exported. Otherwise LGTM, except that..
Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 97 +++++++++++++++++++++++++++++------------------------------ tap.h | 7 +++++ 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/tap.c b/tap.c index 2ea08491a51f..ab8d760efb11 100644 --- a/tap.c +++ b/tap.c @@ -920,6 +920,45 @@ append: return in->count; }
..function comments would be good on the new functions
+void pool_flush_all(void) +{ + pool_flush(pool_tap4); + pool_flush(pool_tap6); +} + +void tap_handler_all(struct ctx *c, const struct timespec *now) +{ + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); +} + +void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p, + const char *func, int line) +{ + const struct ethhdr *eh; + + pcap(p, l2len); + + eh = (struct ethhdr *)p; + + if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { + memcpy(c->mac_guest, eh->h_source, ETH_ALEN); + proto_update_l2_buf(c->mac_guest, NULL); + } + + switch (ntohs(eh->h_proto)) { + case ETH_P_ARP: + case ETH_P_IP: + packet_add_do(pool_tap4, l2len, p, func, line); + break; + case ETH_P_IPV6: + packet_add_do(pool_tap6, l2len, p, func, line); + break; + default: + break; + } +} + /** * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context @@ -946,7 +985,6 @@ static void tap_sock_reset(struct ctx *c) void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now) { - const struct ethhdr *eh; ssize_t n, rem; char *p; @@ -959,8 +997,7 @@ redo: p = pkt_buf; rem = 0; - pool_flush(pool_tap4); - pool_flush(pool_tap6); + pool_flush_all(); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); if (n < 0) { @@ -987,38 +1024,18 @@ redo: /* Complete the partial read above before discarding a malformed * frame, otherwise the stream will be inconsistent.
*/ - if (l2len < (ssize_t)sizeof(*eh) || + if (l2len < (ssize_t)sizeof(struct ethhdr) || l2len > (ssize_t)ETH_MAX_MTU) goto next; - pcap(p, l2len); - - eh = (struct ethhdr *)p; - - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, l2len, p); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, l2len, p); - break; - default: - break; - } + packet_add_all(c, l2len, p); next: p += l2len; n -= l2len; } - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler_all(c, now); /* We can't use EPOLLET otherwise. */ if (rem) @@ -1043,35 +1060,18 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, redo: n = 0; - pool_flush(pool_tap4); - pool_flush(pool_tap6); + pool_flush_all(); restart: while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { - const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); - if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) { + if (len < (ssize_t)sizeof(struct ethhdr) || + len > (ssize_t)ETH_MAX_MTU) { n += len; continue; } - pcap(pkt_buf + n, len); - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, len, pkt_buf + n); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, len, pkt_buf + n); - break; - default: - break; - } + packet_add_all(c, len, pkt_buf + n); if ((n += len) == TAP_BUF_BYTES) break; @@ -1082,8 +1082,7 @@ restart: ret = errno; - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler_all(c, now); if (len > 0 || ret == EAGAIN) return; diff --git a/tap.h b/tap.h index 2285a87093f9..3ffb7d6c3a91 100644 --- a/tap.h +++ b/tap.h @@ -70,5 +70,12 @@ void tap_handler_passt(struct ctx *c, 
uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); void tap_sock_init(struct ctx *c); +void pool_flush_all(void); +void tap_handler_all(struct ctx *c, const struct timespec *now); + +void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p, + const char *func, int line); +#define packet_add_all(p, l2len, start) \ + packet_add_all_do(p, l2len, start, __func__, __LINE__) #endif /* TAP_H */-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- udp.c | 60 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/udp.c b/udp.c index 3abafc994537..4295d48046a6 100644 --- a/udp.c +++ b/udp.c @@ -556,7 +556,8 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @c: Execution context - * @bm: Pointer to udp_meta_t to update + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload @@ -565,15 +566,16 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, * Return: size of IPv4 payload (UDP header + data) */ static size_t udp_update_hdr4(const struct ctx *c, - struct udp_meta_t *bm, struct udp_payload_t *bp, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { - in_port_t srcport = ntohs(bm->s_in.sa4.sin_port); + in_port_t srcport = ntohs(s_in->sin_port); const struct in_addr dst = c->ip4.addr_seen; - struct in_addr src = bm->s_in.sa4.sin_addr; + struct in_addr src = s_in->sin_addr; size_t l4len = dlen + sizeof(bp->uh); - size_t l3len = l4len + sizeof(bm->ip4h); + size_t l3len = l4len + sizeof(*ip4h); if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && @@ -594,24 +596,24 @@ static size_t udp_update_hdr4(const struct ctx *c, src = c->ip4.gw; } - bm->ip4h.tot_len = htons(l3len); - bm->ip4h.daddr = dst.s_addr; - bm->ip4h.saddr = src.s_addr; - bm->ip4h.check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); + ip4h->tot_len = htons(l3len); + ip4h->daddr = dst.s_addr; + ip4h->saddr = src.s_addr; + ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, 
dst); - bp->uh.source = bm->s_in.sa4.sin_port; + bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); csum_udp4(&bp->uh, src, dst, bp->data, dlen); - tap_hdr_update(&bm->taph, l3len + sizeof(udp4_eth_hdr)); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram * @c: Execution context - * @bm: Pointer to udp_meta_t to update + * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) + * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload @@ -620,13 +622,14 @@ static size_t udp_update_hdr4(const struct ctx *c, * Return: size of IPv6 payload (UDP header + data) */ static size_t udp_update_hdr6(const struct ctx *c, - struct udp_meta_t *bm, struct udp_payload_t *bp, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { - const struct in6_addr *src = &bm->s_in.sa6.sin6_addr; + const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; - in_port_t srcport = ntohs(bm->s_in.sa6.sin6_port); + in_port_t srcport = ntohs(s_in6->sin6_port); uint16_t l4len = dlen + sizeof(bp->uh); if (IN6_IS_ADDR_LINKLOCAL(src)) { @@ -663,19 +666,18 @@ static size_t udp_update_hdr6(const struct ctx *c, } - bm->ip6h.payload_len = htons(l4len); - bm->ip6h.daddr = *dst; - bm->ip6h.saddr = *src; - bm->ip6h.version = 6; - bm->ip6h.nexthdr = IPPROTO_UDP; - bm->ip6h.hop_limit = 255; + ip6h->payload_len = htons(l4len); + ip6h->daddr = *dst; + ip6h->saddr = *src; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 255; - bp->uh.source = bm->s_in.sa6.sin6_port; + bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); - bp->uh.len = bm->ip6h.payload_len; + bp->uh.len = ip6h->payload_len; csum_udp6(&bp->uh, src, dst, bp->data, dlen); - tap_hdr_update(&bm->taph, l4len + 
sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); return l4len; } @@ -708,11 +710,17 @@ static void udp_tap_send(const struct ctx *c, size_t l4len; if (v6) { - l4len = udp_update_hdr6(c, bm, bp, dstport, + l4len = udp_update_hdr6(c, &bm->ip6h, + &bm->s_in.sa6, bp, dstport, udp6_l2_mh_sock[i].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + + sizeof(udp6_eth_hdr)); } else { - l4len = udp_update_hdr4(c, bm, bp, dstport, + l4len = udp_update_hdr4(c, &bm->ip4h, + &bm->s_in.sa4, bp, dstport, udp4_l2_mh_sock[i].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + + sizeof(udp4_eth_hdr)); } tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len; } -- 2.44.0
On Fri, May 31, 2024 at 04:23:39PM +0200, Laurent Vivier wrote: Needs a proper commit message. Also, I'm not sure the 1-line description is still accurate in this current rebased version.Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- udp.c | 60 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/udp.c b/udp.c index 3abafc994537..4295d48046a6 100644 --- a/udp.c +++ b/udp.c @@ -556,7 +556,8 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @c: Execution context - * @bm: Pointer to udp_meta_t to update + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload @@ -565,15 +566,16 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, * Return: size of IPv4 payload (UDP header + data) */ static size_t udp_update_hdr4(const struct ctx *c, - struct udp_meta_t *bm, struct udp_payload_t *bp, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now)This change will definitely conflict with my upcoming UDP flow table stuff, but I think it's a good idea. 
So, I may end up cherry-picking this patch or an equivalent one for my series.{ - in_port_t srcport = ntohs(bm->s_in.sa4.sin_port); + in_port_t srcport = ntohs(s_in->sin_port); const struct in_addr dst = c->ip4.addr_seen; - struct in_addr src = bm->s_in.sa4.sin_addr; + struct in_addr src = s_in->sin_addr; size_t l4len = dlen + sizeof(bp->uh); - size_t l3len = l4len + sizeof(bm->ip4h); + size_t l3len = l4len + sizeof(*ip4h); if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && @@ -594,24 +596,24 @@ static size_t udp_update_hdr4(const struct ctx *c, src = c->ip4.gw; } - bm->ip4h.tot_len = htons(l3len); - bm->ip4h.daddr = dst.s_addr; - bm->ip4h.saddr = src.s_addr; - bm->ip4h.check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); + ip4h->tot_len = htons(l3len); + ip4h->daddr = dst.s_addr; + ip4h->saddr = src.s_addr; + ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); - bp->uh.source = bm->s_in.sa4.sin_port; + bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); csum_udp4(&bp->uh, src, dst, bp->data, dlen); - tap_hdr_update(&bm->taph, l3len + sizeof(udp4_eth_hdr)); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram * @c: Execution context - * @bm: Pointer to udp_meta_t to update + * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) + * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload @@ -620,13 +622,14 @@ static size_t udp_update_hdr4(const struct ctx *c, * Return: size of IPv6 payload (UDP header + data) */ static size_t udp_update_hdr6(const struct ctx *c, - struct udp_meta_t *bm, struct udp_payload_t *bp, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { - const struct in6_addr *src = &bm->s_in.sa6.sin6_addr; + 
const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; - in_port_t srcport = ntohs(bm->s_in.sa6.sin6_port); + in_port_t srcport = ntohs(s_in6->sin6_port); uint16_t l4len = dlen + sizeof(bp->uh); if (IN6_IS_ADDR_LINKLOCAL(src)) { @@ -663,19 +666,18 @@ static size_t udp_update_hdr6(const struct ctx *c, } - bm->ip6h.payload_len = htons(l4len); - bm->ip6h.daddr = *dst; - bm->ip6h.saddr = *src; - bm->ip6h.version = 6; - bm->ip6h.nexthdr = IPPROTO_UDP; - bm->ip6h.hop_limit = 255; + ip6h->payload_len = htons(l4len); + ip6h->daddr = *dst; + ip6h->saddr = *src; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 255; - bp->uh.source = bm->s_in.sa6.sin6_port; + bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); - bp->uh.len = bm->ip6h.payload_len; + bp->uh.len = ip6h->payload_len; csum_udp6(&bp->uh, src, dst, bp->data, dlen); - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); return l4len; } @@ -708,11 +710,17 @@ static void udp_tap_send(const struct ctx *c, size_t l4len; if (v6) { - l4len = udp_update_hdr6(c, bm, bp, dstport, + l4len = udp_update_hdr6(c, &bm->ip6h, + &bm->s_in.sa6, bp, dstport, udp6_l2_mh_sock[i].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + + sizeof(udp6_eth_hdr)); } else { - l4len = udp_update_hdr4(c, bm, bp, dstport, + l4len = udp_update_hdr4(c, &bm->ip4h, + &bm->s_in.sa4, bp, dstport, udp4_l2_mh_sock[i].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + + sizeof(udp4_eth_hdr)); } tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len; }-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
We are going to introduce a variant of the function to use vhost-user buffers rather than passt internal buffers. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- passt.c | 2 +- udp.c | 6 +++--- udp.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/passt.c b/passt.c index a8c4cd3f8820..69a59f1e9b6d 100644 --- a/passt.c +++ b/passt.c @@ -365,7 +365,7 @@ loop: tcp_timer_handler(&c, ref); break; case EPOLL_TYPE_UDP: - udp_sock_handler(&c, ref, eventmask, &now); + udp_buf_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); diff --git a/udp.c b/udp.c index 4295d48046a6..a13013901e26 100644 --- a/udp.c +++ b/udp.c @@ -729,7 +729,7 @@ static void udp_tap_send(const struct ctx *c, } /** - * udp_sock_handler() - Handle new data from socket + * udp_buf_sock_handler() - Handle new data from socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -737,8 +737,8 @@ static void udp_tap_send(const struct ctx *c, * * #syscalls recvmmsg */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, + const struct timespec *now) { /* For not entirely clear reasons (data locality?) pasta gets * better throughput if we receive tap datagrams one at a diff --git a/udp.h b/udp.h index 9976b6231f1c..5865def20856 100644 --- a/udp.h +++ b/udp.h @@ -9,7 +9,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ void udp_portmap_clear(void); -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, -- 2.44.0
On Fri, May 31, 2024 at 04:23:40PM +0200, Laurent Vivier wrote:We are going to introduce a variant of the function to use vhost-user buffers rather than passt internal buffers. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>--- passt.c | 2 +- udp.c | 6 +++--- udp.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/passt.c b/passt.c index a8c4cd3f8820..69a59f1e9b6d 100644 --- a/passt.c +++ b/passt.c @@ -365,7 +365,7 @@ loop: tcp_timer_handler(&c, ref); break; case EPOLL_TYPE_UDP: - udp_sock_handler(&c, ref, eventmask, &now); + udp_buf_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); diff --git a/udp.c b/udp.c index 4295d48046a6..a13013901e26 100644 --- a/udp.c +++ b/udp.c @@ -729,7 +729,7 @@ static void udp_tap_send(const struct ctx *c, } /** - * udp_sock_handler() - Handle new data from socket + * udp_buf_sock_handler() - Handle new data from socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -737,8 +737,8 @@ static void udp_tap_send(const struct ctx *c, * * #syscalls recvmmsg */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, + const struct timespec *now) { /* For not entirely clear reasons (data locality?) 
pasta gets * better throughput if we receive tap datagrams one at a diff --git a/udp.h b/udp.h index 9976b6231f1c..5865def20856 100644 --- a/udp.h +++ b/udp.h @@ -9,7 +9,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ void udp_portmap_clear(void); -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr,-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
As we are going to introduce the MODE_VU that will act like the mode MODE_PASST, compare to MODE_PASTA rather than to add a comparison to MODE_VU when we check for MODE_PASST. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- conf.c | 14 +++++++------- isolation.c | 10 +++++----- passt.c | 2 +- tap.c | 12 ++++++------ tcp_buf.c | 2 +- udp.c | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/conf.c b/conf.c index 50383a392f8d..b9d189ff4d26 100644 --- a/conf.c +++ b/conf.c @@ -147,7 +147,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if (fwd->mode) goto mode_conflict; - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("'all' port forwarding is only allowed for passt"); fwd->mode = FWD_ALL; @@ -1120,7 +1120,7 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid) */ static void conf_open_files(struct ctx *c) { - if (c->mode == MODE_PASST && c->fd_tap == -1) + if (c->mode != MODE_PASTA && c->fd_tap == -1) c->fd_tap_listen = tap_sock_unix_open(c->sock_path); c->pidfile_fd = pidfile_open(c->pidfile); @@ -1261,7 +1261,7 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcp_dns = 0; break; case 6: - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--no-dhcp-dns is for passt mode only"); c->no_dhcp_dns = 1; @@ -1273,7 +1273,7 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcp_dns_search = 0; break; case 8: - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--no-dhcp-search is for passt mode only"); c->no_dhcp_dns_search = 1; @@ -1328,7 +1328,7 @@ void conf(struct ctx *c, int argc, char **argv) break; case 14: fprintf(stdout, - c->mode == MODE_PASST ? "passt " : "pasta "); + c->mode == MODE_PASTA ? 
"pasta " : "passt "); fprintf(stdout, VERSION_BLOB); exit(EXIT_SUCCESS); case 15: @@ -1631,7 +1631,7 @@ void conf(struct ctx *c, int argc, char **argv) v6_only = true; break; case '1': - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--one-off is for passt mode only"); if (c->one_off) @@ -1678,7 +1678,7 @@ void conf(struct ctx *c, int argc, char **argv) conf_ugid(runas, &uid, &gid); if (logfile) { - logfile_init(c->mode == MODE_PASST ? "passt" : "pasta", + logfile_init(c->mode == MODE_PASTA ? "pasta" : "passt", logfile, logsize); } diff --git a/isolation.c b/isolation.c index f394e93b8526..ca2c68b52ec7 100644 --- a/isolation.c +++ b/isolation.c @@ -312,7 +312,7 @@ int isolate_prefork(const struct ctx *c) * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody * ever gets around seccomp profiles -- there's no harm in passing it. */ - if (!c->foreground || c->mode == MODE_PASST) + if (!c->foreground || c->mode != MODE_PASTA) flags |= CLONE_NEWPID; if (unshare(flags)) { @@ -379,12 +379,12 @@ void isolate_postfork(const struct ctx *c) prctl(PR_SET_DUMPABLE, 0); - if (c->mode == MODE_PASST) { - prog.len = (unsigned short)ARRAY_SIZE(filter_passt); - prog.filter = filter_passt; - } else { + if (c->mode == MODE_PASTA) { prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); prog.filter = filter_pasta; + } else { + prog.len = (unsigned short)ARRAY_SIZE(filter_passt); + prog.filter = filter_passt; } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || diff --git a/passt.c b/passt.c index 69a59f1e9b6d..b02a0df17347 100644 --- a/passt.c +++ b/passt.c @@ -333,7 +333,7 @@ loop: uint32_t eventmask = events[i].events; trace("%s: epoll event on %s %i (events: 0x%08x)", - c.mode == MODE_PASST ? "passt" : "pasta", + c.mode == MODE_PASTA ? 
"pasta" : "passt", EPOLL_TYPE_STR(ref.type), ref.fd, eventmask); switch (ref.type) { diff --git a/tap.c b/tap.c index ab8d760efb11..c436d2d7b72b 100644 --- a/tap.c +++ b/tap.c @@ -416,10 +416,10 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASST) - m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); - else + if (c->mode == MODE_PASTA) m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); + else + m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -1316,7 +1316,9 @@ void tap_sock_init(struct ctx *c) return; } - if (c->mode == MODE_PASST) { + if (c->mode == MODE_PASTA) + tap_sock_tun_init(c); + else { tap_sock_unix_init(c); /* In passt mode, we don't know the guest's MAC address until it @@ -1324,7 +1326,5 @@ void tap_sock_init(struct ctx *c) * first packets will reach it. */ memset(&c->mac_guest, 0xff, sizeof(c->mac_guest)); - } else { - tap_sock_tun_init(c); } } diff --git a/tcp_buf.c b/tcp_buf.c index 87923029a958..630e83e9a01a 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -35,7 +35,7 @@ #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ - (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) + (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM) /** * tcp_buf_seq_update - Sequences to update with length of frames once sent diff --git a/udp.c b/udp.c index a13013901e26..def3d57a6183 100644 --- a/udp.c +++ b/udp.c @@ -748,7 +748,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve * whether we'll use tap or splice, always go one at a time * for pasta mode. */ - ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1); + ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); in_port_t dstport = ref.udp.port; bool v6 = ref.udp.v6; struct mmsghdr *mmh_recv; -- 2.44.0
On Fri, May 31, 2024 at 04:23:41PM +0200, Laurent Vivier wrote: As we are going to introduce the MODE_VU that will act like the mode MODE_PASST, compare to MODE_PASTA rather than to add a comparison to MODE_VU when we check for MODE_PASST. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
iov_copy() was needed by a draft version of vhost-user; it is not needed anymore. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- iov.c | 39 --------------------------------------- iov.h | 3 --- 2 files changed, 42 deletions(-) diff --git a/iov.c b/iov.c index 52a7c014a171..3f9e229a305f 100644 --- a/iov.c +++ b/iov.c @@ -156,42 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) return len; } - -/** - * iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to - * another. - * - * @dst_iov: Pointer to the destination array of struct iovec describing - * the scatter/gather I/O vector to copy to. - * @dst_iov_cnt: Number of elements in the destination iov array. - * @iov: Pointer to the source array of struct iovec describing - * the scatter/gather I/O vector to copy from. - * @iov_cnt: Number of elements in the source iov array. - * @offset: Offset within the source iov from where copying should start. - * @bytes: Total number of bytes to copy from iov to dst_iov. - * - * Returns: The number of elements successfully copied to the destination - * iov array. 
- */ -/* cppcheck-suppress unusedFunction */ -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes) -{ - unsigned int i, j; - - i = iov_skip_bytes(iov, iov_cnt, offset, &offset); - - /* copying data */ - for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { - size_t len = MIN(bytes, iov[i].iov_len - offset); - - dst_iov[j].iov_base = (char *)iov[i].iov_base + offset; - dst_iov[j].iov_len = len; - j++; - bytes -= len; - offset = 0; - } - - return j; -} diff --git a/iov.h b/iov.h index 5668ca5f93bc..a9e1722713b3 100644 --- a/iov.h +++ b/iov.h @@ -28,7 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes); #endif /* IOVEC_H */ -- 2.44.0
On Fri, May 31, 2024 at 04:23:42PM +0200, Laurent Vivier wrote:it was needed by a draft version of vhost-user, it is not needed anymore. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> Stefano, I think this could be merged now, independent of the rest of the series.--- iov.c | 39 --------------------------------------- iov.h | 3 --- 2 files changed, 42 deletions(-) diff --git a/iov.c b/iov.c index 52a7c014a171..3f9e229a305f 100644 --- a/iov.c +++ b/iov.c @@ -156,42 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) return len; } - -/** - * iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to - * another. - * - * @dst_iov: Pointer to the destination array of struct iovec describing - * the scatter/gather I/O vector to copy to. - * @dst_iov_cnt: Number of elements in the destination iov array. - * @iov: Pointer to the source array of struct iovec describing - * the scatter/gather I/O vector to copy from. - * @iov_cnt: Number of elements in the source iov array. - * @offset: Offset within the source iov from where copying should start. - * @bytes: Total number of bytes to copy from iov to dst_iov. - * - * Returns: The number of elements successfully copied to the destination - * iov array. 
- */ -/* cppcheck-suppress unusedFunction */ -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes) -{ - unsigned int i, j; - - i = iov_skip_bytes(iov, iov_cnt, offset, &offset); - - /* copying data */ - for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { - size_t len = MIN(bytes, iov[i].iov_len - offset); - - dst_iov[j].iov_base = (char *)iov[i].iov_base + offset; - dst_iov[j].iov_len = len; - j++; - bytes -= len; - offset = 0; - } - - return j; -} diff --git a/iov.h b/iov.h index 5668ca5f93bc..a9e1722713b3 100644 --- a/iov.h +++ b/iov.h @@ -28,7 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes); #endif /* IOVEC_H */-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
As tap_hdr is not used with vhost-user, remove it from tcp_fill_headers4() and tcp_fill_headers6() Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tcp.c | 8 -------- tcp_buf.c | 18 ++++++++++++++---- tcp_internal.h | 2 -- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tcp.c b/tcp.c index 48d8f7c6d696..433ab1fab30f 100644 --- a/tcp.c +++ b/tcp.c @@ -1017,7 +1017,6 @@ static void tcp_fill_header(struct tcphdr *th, * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @taph: tap backend specific header * @iph: Pointer to IPv4 header * @th: Pointer to TCP header * @dlen: TCP payload length @@ -1028,7 +1027,6 @@ static void tcp_fill_header(struct tcphdr *th, */ size_t tcp_fill_headers4(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, size_t dlen, const uint16_t *check, uint32_t seq) @@ -1051,8 +1049,6 @@ size_t tcp_fill_headers4(const struct ctx *c, tcp_update_check_tcp4(iph, th); - tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); - return l4len; } @@ -1060,7 +1056,6 @@ size_t tcp_fill_headers4(const struct ctx *c, * tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @taph: tap backend specific header * @ip6h: Pointer to IPv6 header * @th: Pointer to TCP header * @dlen: TCP payload length @@ -1071,7 +1066,6 @@ size_t tcp_fill_headers4(const struct ctx *c, */ size_t tcp_fill_headers6(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ipv6hdr *ip6h, struct tcphdr *th, size_t dlen, uint32_t seq) { @@ -1096,8 +1090,6 @@ size_t tcp_fill_headers6(const struct ctx *c, tcp_update_check_tcp6(ip6h, th); - tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); - return l4len; } diff --git a/tcp_buf.c b/tcp_buf.c index 630e83e9a01a..cd4549c06035 100644 --- a/tcp_buf.c +++ b/tcp_buf.c 
@@ -298,10 +298,12 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (ret <= 0) return ret; - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, optlen, NULL, conn->seq_to_tap); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct iphdr) + sizeof(struct ethhdr)); } else { iov = tcp6_l2_flags_iov[tcp6_flags_used++]; @@ -312,10 +314,13 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (ret <= 0) return ret; - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, optlen, conn->seq_to_tap); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct ipv6hdr) + + sizeof(struct ethhdr)); } iov[TCP_IOV_PAYLOAD].iov_len = l4len; @@ -373,10 +378,12 @@ void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp4_seq_update[tcp4_payload_used].len = dlen; iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, dlen, check, seq); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct iphdr) + sizeof(struct ethhdr)); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); @@ -385,10 +392,13 @@ void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp6_seq_update[tcp6_payload_used].len = dlen; iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, dlen, seq); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct ipv6hdr) + + sizeof(struct ethhdr)); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp6_payload_used > 
TCP_FRAMES_MEM - 1) tcp_payload_flush(c); diff --git a/tcp_internal.h b/tcp_internal.h index e47b64a68afd..5c7a52b8293c 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -70,13 +70,11 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); size_t tcp_fill_headers4(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, size_t dlen, const uint16_t *check, uint32_t seq); size_t tcp_fill_headers6(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ipv6hdr *ip6h, struct tcphdr *th, size_t dlen, uint32_t seq); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, -- 2.44.0
On Fri, May 31, 2024 at 04:23:43PM +0200, Laurent Vivier wrote:

> As tap_hdr is not used with vhost-user, remove it from tcp_fill_headers4() and tcp_fill_headers6()

So, this is kind of at odds with how I intended tap_hdr to work. The idea was that it would cover any sort of specific headers we needed for the tap backend. Currently that's either a length field (qemu socket) or nothing (tuntap), but it could be other things if we need them. At the moment tap_hdr_update() fills in the vnet_len in all cases, even MODE_PASTA, because my guess was that it's cheaper to do that than to test the mode every time. If it matters we could make it do nothing in both PASTA and VU modes, and that's more what I'd expect rather than extracting it from the path here. I don't know if VU mode has any use for some backend specific "header" (would it make sense to put the descriptor ring entry here?).

> Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tcp.c | 8 -------- tcp_buf.c | 18 ++++++++++++++---- tcp_internal.h | 2 -- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tcp.c b/tcp.c index 48d8f7c6d696..433ab1fab30f 100644 --- a/tcp.c +++ b/tcp.c @@ -1017,7 +1017,6 @@ static void tcp_fill_header(struct tcphdr *th, * tcp_fill_headers4() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @taph: tap backend specific header * @iph: Pointer to IPv4 header * @th: Pointer to TCP header * @dlen: TCP payload length @@ -1028,7 +1027,6 @@ static void tcp_fill_header(struct tcphdr *th, */ size_t tcp_fill_headers4(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, size_t dlen, const uint16_t *check, uint32_t seq) @@ -1051,8 +1049,6 @@ size_t tcp_fill_headers4(const struct ctx *c, tcp_update_check_tcp4(iph, th); - tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); - return l4len; } @@ -1060,7 +1056,6 @@ size_t tcp_fill_headers4(const struct ctx *c, *
tcp_fill_headers6() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @taph: tap backend specific header * @ip6h: Pointer to IPv6 header * @th: Pointer to TCP header * @dlen: TCP payload length @@ -1071,7 +1066,6 @@ size_t tcp_fill_headers4(const struct ctx *c, */ size_t tcp_fill_headers6(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ipv6hdr *ip6h, struct tcphdr *th, size_t dlen, uint32_t seq) { @@ -1096,8 +1090,6 @@ size_t tcp_fill_headers6(const struct ctx *c, tcp_update_check_tcp6(ip6h, th); - tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); - return l4len; } diff --git a/tcp_buf.c b/tcp_buf.c index 630e83e9a01a..cd4549c06035 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -298,10 +298,12 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (ret <= 0) return ret; - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, optlen, NULL, conn->seq_to_tap); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct iphdr) + sizeof(struct ethhdr)); } else { iov = tcp6_l2_flags_iov[tcp6_flags_used++]; @@ -312,10 +314,13 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (ret <= 0) return ret; - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, optlen, conn->seq_to_tap); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct ipv6hdr) + + sizeof(struct ethhdr)); } iov[TCP_IOV_PAYLOAD].iov_len = l4len; @@ -373,10 +378,12 @@ void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp4_seq_update[tcp4_payload_used].len = dlen; iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers4(c, conn, 
iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, dlen, check, seq); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct iphdr) + sizeof(struct ethhdr)); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); @@ -385,10 +392,13 @@ void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp6_seq_update[tcp6_payload_used].len = dlen; iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base, + l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, iov[TCP_IOV_PAYLOAD].iov_base, dlen, seq); + tap_hdr_update(iov[TCP_IOV_TAP].iov_base, + l4len + sizeof(struct ipv6hdr) + + sizeof(struct ethhdr)); iov[TCP_IOV_PAYLOAD].iov_len = l4len; if (tcp6_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); diff --git a/tcp_internal.h b/tcp_internal.h index e47b64a68afd..5c7a52b8293c 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -70,13 +70,11 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); size_t tcp_fill_headers4(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct iphdr *iph, struct tcphdr *th, size_t dlen, const uint16_t *check, uint32_t seq); size_t tcp_fill_headers6(const struct ctx *c, const struct tcp_tap_conn *conn, - struct tap_hdr *taph, struct ipv6hdr *ip6h, struct tcphdr *th, size_t dlen, uint32_t seq); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
buf_size is set to sizeof(pkt_buf) by default. And it seems more correct to provide the actual size of the buffer. Later a buf_size of 0 will allow vhost-user mode to detect guest memory buffers. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tap.c b/tap.c index c436d2d7b72b..28b515906f3b 100644 --- a/tap.c +++ b/tap.c @@ -602,7 +602,7 @@ resume: if (!eh) continue; if (ntohs(eh->h_proto) == ETH_P_ARP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l2len, (char *)eh); arp(c, pkt); @@ -642,7 +642,7 @@ resume: continue; if (iph->protocol == IPPROTO_ICMP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; @@ -661,7 +661,7 @@ resume: continue; if (iph->protocol == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l2len, (char *)eh); if (dhcp(c, pkt)) @@ -810,7 +810,7 @@ resume: } if (proto == IPPROTO_ICMPV6) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; @@ -834,7 +834,7 @@ resume: uh = (struct udphdr *)l4h; if (proto == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l4len, l4h); -- 2.44.0
On Fri, May 31, 2024 at 04:23:44PM +0200, Laurent Vivier wrote:

> buf_size is set to sizeof(pkt_buf) by default. And it seems more correct to provide the actual size of the buffer. Later a buf_size of 0 will allow vhost-user mode to detect guest memory buffers. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>

Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>

Stefano, I think this one would also make sense to apply immediately, independently of the rest of the series.

> --- tap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tap.c b/tap.c index c436d2d7b72b..28b515906f3b 100644 --- a/tap.c +++ b/tap.c @@ -602,7 +602,7 @@ resume: if (!eh) continue; if (ntohs(eh->h_proto) == ETH_P_ARP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l2len, (char *)eh); arp(c, pkt); @@ -642,7 +642,7 @@ resume: continue; if (iph->protocol == IPPROTO_ICMP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; @@ -661,7 +661,7 @@ resume: continue; if (iph->protocol == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l2len, (char *)eh); if (dhcp(c, pkt)) @@ -810,7 +810,7 @@ resume: } if (proto == IPPROTO_ICMPV6) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; @@ -834,7 +834,7 @@ resume: uh = (struct udphdr *)l4h; if (proto == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l4len, l4h);

-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson