On Mon, 11 Mar 2024 14:33:56 +0100
Laurent Vivier <lvivier(a)redhat.com> wrote:

> To be able to provide pointers to TCP headers and IP headers without
> worrying about alignment in the structure, split the structure into
> several arrays and point to each part of the frame using an iovec
> array.
>
> Using iovec also allows us to simply ignore the first entry when the
> vnet length header is not needed. And as the payload buffer contains
> only the TCP header and the TCP data we can increase the size of the
> TCP data to USHRT_MAX - sizeof(struct tcphdr).
>
> As a side effect, these changes improve performance by a factor of
> x1.5.
>
> Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
> ---
>  tap.c | 104 ++++++++++++++
>  tap.h |  16 +++
>  tcp.c | 429 ++++++++++++++++++++++++----------------------------------
>  3 files changed, 296 insertions(+), 253 deletions(-)
>
> diff --git a/tap.c b/tap.c
> index c7b9372668ec..afd01cffbed6 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c,
>  	return i;
>  }
>
> +/**
> + * tap_send_iov_pasta() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *		The first entry of the iovec is ignored

Fits on single lines (I think it's more readable):

 * @c:		Execution context
 * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET unused
 * @n:		Number of frames in @iov

> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +static size_t tap_send_iov_pasta(const struct ctx *c,
> +				 struct iovec iov[][TCP_IOV_NUM], size_t n)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n; i++) {
> +		if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH],
> +					   TCP_IOV_NUM - TCP_IOV_ETH))
> +			break;
> +	}
> +
> +	return i;
> +
> +}
> +
>  /**
>   * tap_send_frames_passt() - Send multiple frames to the passt tap
>   * @c:		Execution context
> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c,
>  	return i;
>  }
>
> +/**
> + * tap_send_iov_passt() - Send out multiple prepared frames

...I would argue that this function prepares frames as well. Maybe:

 * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames

> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *		The first entry of the iovec is updated to point to an
> + *		uint32_t storing the frame length.

 * @iov:	Array of frames, each one a vector of parts, TCP_IOV_VNET blank

> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +static size_t tap_send_iov_passt(const struct ctx *c,
> +				 struct iovec iov[][TCP_IOV_NUM],
> +				 size_t n)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n; i++) {
> +		uint32_t vnet_len;
> +		int j;
> +
> +		vnet_len = 0;

This could be initialised in the declaration (yes, it's "reset" at
every loop iteration).

> +		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
> +			vnet_len += iov[i][j].iov_len;
> +
> +		vnet_len = htonl(vnet_len);
> +		iov[i][TCP_IOV_VNET].iov_base = &vnet_len;
> +		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len);
> +
> +		if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))

...which would now send a single frame at a time, but actually it can
already send everything in one shot because it's using sendmsg(), if
you move it outside of the loop and do something like (untested):

	return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);
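To make it concrete, a rough and untested sketch of what I mean -- it
assumes the length descriptors get storage that survives the loop (a
static array here, sized by TCP_FRAMES_MEM or whatever equivalent bound
is visible from tap.c), and that the iovec count returned by
tap_send_frames_passt() can simply be scaled back to frames:

static size_t tap_send_iov_passt(const struct ctx *c,
				 struct iovec iov[][TCP_IOV_NUM], size_t n)
{
	/* One frame length descriptor per frame: they all need to stay
	 * valid until the single sendmsg() issued below.
	 */
	static uint32_t vnet_len[TCP_FRAMES_MEM];
	unsigned int i;

	for (i = 0; i < n; i++) {
		int j;

		vnet_len[i] = 0;
		for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++)
			vnet_len[i] += iov[i][j].iov_len;

		vnet_len[i] = htonl(vnet_len[i]);
		iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i];
		iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len[i]);
	}

	/* Everything goes out in one shot; scale iovecs back to frames */
	return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) /
	       TCP_IOV_NUM;
}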
> +			break;
> +	}
> +
> +	return i;
> +
> +}
> +
>  /**
>   * tap_send_frames() - Send out multiple prepared frames
>   * @c:		Execution context
> @@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n)
>  	return m;
>  }
>
> +/**
> + * tap_send_iov() - Send out multiple prepared frames
> + * @c:		Execution context
> + * @iov:	Array of frames, each frames is divided in an array of iovecs.
> + *		iovec array is:

I think this description should be moved before the defines -- it could
even be an enum with fixed numbers, and I just realised that in
kernel-doc enums should be documented in the same way as structs (we
didn't do that so far), that is:

/**
 * enum tcp_iov_parts - I/O vector parts for one TCP frame
 * @TCP_IOV_VNET:	vnet length: host order, 32-bit frame length descriptor
 * ...
 */
enum tcp_iov_parts {
	/* vnet length (host order, 32-bit length descriptor) */
	TCP_IOV_VNET	= 0,
	[...]
};
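Spelled out in full, just to show the shape I mean (values and
descriptions taken from the defines and the comment you add in tap.h):

/**
 * enum tcp_iov_parts - Parts of a TCP frame, one iovec entry each
 * @TCP_IOV_VNET:	vnet length (32-bit frame length descriptor)
 * @TCP_IOV_ETH:	Ethernet header
 * @TCP_IOV_IP:		IP (v4/v6) header
 * @TCP_IOV_PAYLOAD:	IP payload (TCP header + data)
 * @TCP_IOV_NUM:	Number of entries in the iovec array
 */
enum tcp_iov_parts {
	TCP_IOV_VNET	= 0,
	TCP_IOV_ETH	= 1,
	TCP_IOV_IP	= 2,
	TCP_IOV_PAYLOAD	= 3,
	TCP_IOV_NUM	= 4,
};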
> + *		TCP_IOV_VNET (0) vnet length
> + *		TCP_IOV_ETH (1) ethernet header
> + *		TCP_IOV_IP (2) IP (v4/v6) header
> + *		TCP_IOV_PAYLOAD (3) IP payload (TCP header + data)
> + *		TCP_IOV_NUM (4) is the number of entries in the iovec array
> + *		TCP_IOV_VNET entry is updated with passt, ignored with pasta.
> + * @n:		Number of frames in @iov
> + *
> + * Return: number of frames actually sent
> + */
> +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
> +		    size_t n)
> +{
> +	size_t m;
> +	unsigned int i;
> +
> +	if (!n)
> +		return 0;
> +
> +	switch (c->mode) {
> +	case MODE_PASST:
> +		m = tap_send_iov_passt(c, iov, n);
> +		break;
> +	case MODE_PASTA:
> +		m = tap_send_iov_pasta(c, iov, n);
> +		break;
> +	default:
> +		ASSERT(0);
> +	}
> +
> +	if (m < n)
> +		debug("tap: failed to send %zu frames of %zu", n - m, n);
> +
> +	for (i = 0; i < m; i++)
> +		pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH);
> +
> +	return m;
> +}
> +
>  /**
>   * eth_update_mac() - Update tap L2 header with new Ethernet addresses
>   * @eh:		Ethernet headers to update
> diff --git a/tap.h b/tap.h
> index 437b9aa2b43f..2d8e7aa1101d 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -6,6 +6,20 @@
>  #ifndef TAP_H
>  #define TAP_H
>
> +/*
> + * TCP frame iovec array:
> + * TCP_IOV_VNET		vnet length
> + * TCP_IOV_ETH		ethernet header
> + * TCP_IOV_IP		IP (v4/v6) header
> + * TCP_IOV_PAYLOAD	IP payload (TCP header + data)
> + * TCP_IOV_NUM		is the number of entries in the iovec array
> + */
> +#define TCP_IOV_VNET	0
> +#define TCP_IOV_ETH	1
> +#define TCP_IOV_IP	2
> +#define TCP_IOV_PAYLOAD	3
> +#define TCP_IOV_NUM	4
> +
>  /**
>   * struct tap_hdr - L2 and tap specific headers
>   * @vnet_len:	Frame length (for qemu socket transport)
> @@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c,
>  		    const void *in, size_t len);
>  int tap_send(const struct ctx *c, const void *data, size_t len);
>  size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n);
> +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM],
> +		    size_t n);
>  void eth_update_mac(struct ethhdr *eh,
>  		    const unsigned char *eth_d, const unsigned char *eth_s);
>  void tap_listen_handler(struct ctx *c, uint32_t events);
> diff --git a/tcp.c b/tcp.c
> index d5eedf4d0138..5c8488108ef7 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -318,39 +318,7 @@
>  /* MSS rounding: see SET_MSS() */
>  #define MSS_DEFAULT			536
> -
> -struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[26];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct iphdr iph;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> -#ifdef __AVX2__
> -	uint8_t pad[14];
> -#else
> -	uint8_t pad[2];
> -#endif
> -	struct tap_hdr taph;
> -	struct ipv6hdr ip6h;
> -	struct tcphdr th;
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif

I'm so glad to see this going away.

> -#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
> -#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)
> +#define MSS	(USHRT_MAX - sizeof(struct tcphdr))

We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4
should now be:

#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) - \
			   sizeof(struct iphdr) - sizeof(struct tcphdr), 4)

...and similar for IPv6.
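That is, the pair would look something like this (sketch -- same
ROUND_DOWN and 4-byte rounding as the current macros, only the IP
header size differs between the two):

#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) -		\
			   sizeof(struct iphdr)   - sizeof(struct tcphdr), 4)
#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) -		\
			   sizeof(struct ipv6hdr) - sizeof(struct tcphdr), 4)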
>  #define WINDOW_DEFAULT			14600		/* RFC 6928 */
>  #ifdef HAS_SND_WND
> @@ -445,133 +413,78 @@ struct tcp_buf_seq_update {
>  };
>
>  /* Static buffers */
> -
>  /**
> - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @uh:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> + * tcp_l2_flags_t - TCP header and data to send option flags

'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).

> + * @th:	TCP header
> + * @opts	TCP option flags
>   */
> -static struct tcp4_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/* align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26	2 */
> -	struct iphdr iph;	/* 44	20 */
> -	struct tcphdr th;	/* 64	40 */
> -	uint8_t data[MSS4];	/* 84	60 */
> -				/* 65536 65532 */
> +struct tcp_l2_flags_t {
> +	struct tcphdr th;
> +	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> +};

This should still be aligned (when declared below) with:

#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif

because yes, now you get the unsigned int alignment for free, but the
32-byte boundary for AVX2 (checksums) not really, so that needs to be
there, and then for clarity I would keep the explicit attribute for the
non-AVX2 case.

By the way, I guess you could still combine the definition and
declaration as it was done earlier.
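Roughly like this, that is (untested sketch, reusing your field
descriptions; the two array declarations could equally stay separate):

/**
 * struct tcp_l2_flags_t - TCP header and option flags to send
 * @th:		TCP header
 * @opts:	TCP option flags
 */
static struct tcp_l2_flags_t {
	struct tcphdr th;
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags[TCP_FRAMES_MEM], tcp6_l2_flags[TCP_FRAMES_MEM];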
> +/**
> + * tcp_l2_payload_t - TCP header and data to send data
> + *		32 bytes aligned to be able to use AVX2 checksum
> + * @th:	TCP header
> + * @data:	TCP data
> + */
> +struct tcp_l2_payload_t {
> +	struct tcphdr th;	/* 20 bytes */
> +	uint8_t data[MSS];	/* 65516 bytes */
>  #ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> +} __attribute__ ((packed, aligned(32)));
>  #else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
>  #endif
> -tcp4_l2_buf[TCP_FRAMES_MEM];
> -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr		tcp4_eth_src;
> +/* IPv4 headers */
> +static struct iphdr		tcp4_l2_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv4 frames */
> +static struct tcp_l2_payload_t	tcp4_l2_payload[TCP_FRAMES_MEM];
> +
> +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp4_l2_buf_used;
>
> -/**
> - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @data:	Storage for TCP payload
> - */
> -struct tcp6_l2_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/* align ip6h to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 14	2 */
> -	struct ipv6hdr ip6h;	/* 32	20 */
> -	struct tcphdr th;	/* 72	60 */
> -	uint8_t data[MSS6];	/* 92	80 */
> -				/* 65536 65532 */
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_buf[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP option flags frames */
> +static struct iphdr		tcp4_l2_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv4 frames */
> +static struct tcp_l2_flags_t	tcp4_l2_flags[TCP_FRAMES_MEM];
>
> -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
> +static unsigned int tcp4_l2_flags_buf_used;
> +
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr		tcp6_eth_src;
> +/* IPv6 headers */
> +static struct ipv6hdr		tcp6_l2_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_l2_payload_t	tcp6_l2_payload[TCP_FRAMES_MEM];
> +
> +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>
> +/* IPv6 headers for TCP option flags frames */
> +static struct ipv6hdr		tcp6_l2_flags_ip[TCP_FRAMES_MEM];
> +/* TCP headers and option flags for IPv6 frames */
> +static struct tcp_l2_flags_t	tcp6_l2_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp6_l2_flags_buf_used;
> +
>  /* recvmsg()/sendmsg() data for tap */
>  static char		tcp_buf_discard		[MAX_WINDOW];
>  static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];
>
> -static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
> -static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
> -static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];
> +static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
> +static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_IOV_NUM];
>
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>
> -/**
> - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
> - * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @iph:	Pre-filled IP header (except for tot_len and saddr)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp4_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[26];	/* 0, align th to 32 bytes */
> -#else
> -	uint8_t pad[2];		/* align iph to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 26	2 */
> -	struct iphdr iph;	/* 44	20 */
> -	struct tcphdr th;	/* 64	40 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp4_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_l2_flags_buf_used;
> -
> -/**
> - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
> - * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
> - * @taph:	Tap-level headers (partially pre-filled)
> - * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
> - * @th:		Headroom for TCP header
> - * @opts:	Headroom for TCP options
> - */
> -static struct tcp6_l2_flags_buf_t {
> -#ifdef __AVX2__
> -	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
> -#else
> -	uint8_t pad[2];		/* align ip6h to 4 bytes	0 */
> -#endif
> -	struct tap_hdr taph;	/* 14	2 */
> -	struct ipv6hdr ip6h;	/* 32	20 */
> -	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
> -	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)))
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
> -#endif
> -tcp6_l2_flags_buf[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_l2_flags_buf_used;
> -
>  #define CONN(idx)		(&(FLOW(idx)->tcp))
>
>  /* Table for lookup from remote address, local port, remote port */
> @@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
>   */
>  void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)

This doesn't update "L2 buffers" anymore, just the Ethernet addresses
(function comment should be updated).

>  {
> -	int i;
> -
> -	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> -		struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i];
> -		struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i];
> -		struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i];
> -		struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i];
> -
> -		eth_update_mac(&b4->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b4f->taph.eh, eth_d, eth_s);
> -		eth_update_mac(&b6f->taph.eh, eth_d, eth_s);
> -	}
> +	eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> +	eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
>  }
>
>  /**
> @@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
>  static void tcp_sock4_iov_init(const struct ctx *c)
>  {
>  	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> -	struct iovec *iov;
>  	int i;
>
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) {
> -		tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = iph,
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	(void)c;

...then drop it from the parameter list?
> -	for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) {
> -		tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IP),
> -			.iph = L2_BUF_IP4_INIT(IPPROTO_TCP)
> -		};
> -	}
> +	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
> +
> +		/* headers */
> +		tcp4_l2_ip[i] = iph;
> +		tcp4_l2_payload[i].th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
>
> -	for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph);
> +		tcp4_l2_flags_ip[i] = iph;
> +		tcp4_l2_flags[i].th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
>
> -	for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp4_l2_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i];
> +
> +		iov = tcp4_l2_flags_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i];
> +	}
>  }
>
>  /**
> @@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c)
>   */
>  static void tcp_sock6_iov_init(const struct ctx *c)
>  {
> -	struct iovec *iov;
> +	struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
>  	int i;
>
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) {
> -		tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP),
> -			.th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 }
> -		};
> -	}
> +	(void)c;
>
> -	for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) {
> -		tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) {
> -			.taph = TAP_HDR_INIT(ETH_P_IPV6),
> -			.ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP)
> -		};
> -	}
> +	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +	for (i = 0; i < TCP_FRAMES_MEM; i++) {
> +		struct iovec *iov;
> +
> +		/* headers */
> +		tcp6_l2_ip[i] = ip6;
> +		tcp6_l2_payload[i].th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
>
> -	for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph);
> +		tcp6_l2_flags_ip[i] = ip6;
> +		tcp6_l2_flags[i].th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
>
> -	for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++)
> -		iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph);
> +		/* iovecs */
> +		iov = tcp6_l2_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i];
> +
> +		iov = tcp6_l2_flags_iov[i];
> +		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> +		iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);
> +		iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i];
> +		iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
> +		iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i];
> +	}
>  }
>
>  /**
> @@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
>   */
>  static void tcp_l2_flags_buf_flush(const struct ctx *c)
>  {
> -	tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
> +	tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used);
>  	tcp6_l2_flags_buf_used = 0;
>
> -	tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
> +	tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used);
>  	tcp4_l2_flags_buf_used = 0;
>  }
>
> @@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c)
>  	unsigned i;
>  	size_t m;
>
> -	m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
> +	m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used);
>  	for (i = 0; i < m; i++)
>  		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
>  	tcp6_l2_buf_used = 0;
>
> -	m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
> +	m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used);
>  	for (i = 0; i < m; i++)
>  		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
>  	tcp4_l2_buf_used = 0;
> @@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
>   * @c:		Execution context
>   * @conn:	Connection pointer
> - * @p:		Pointer to any type of TCP pre-cooked buffer
> + * @iov:	Pointer to an array of iovec of TCP pre-cooked buffer
>   * @plen:	Payload length (including TCP header options)
>   * @check:	Checksum, if already known
>   * @seq:	Sequence number for this segment
> @@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c,
>   */
>  static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
>  				      const struct tcp_tap_conn *conn,
> -				      void *p, size_t plen,
> +				      struct iovec *iov, size_t plen,
>  				      const uint16_t *check, uint32_t seq)
>  {
>  	const struct in_addr *a4 = inany_v4(&conn->faddr);
>  	size_t ip_len, tlen;
>
>  	if (a4) {
> -		struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen,
> +		ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   check, seq);
> -
> -		tlen = tap_iov_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct iphdr);
>  	} else {
> -		struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p;
> -
> -		ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen,
> +		ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base,
> +					   iov[TCP_IOV_PAYLOAD].iov_base, plen,
>  					   seq);
> -
> -		tlen = tap_iov_len(c, &b->taph, ip_len);
> +		tlen = ip_len - sizeof(struct ipv6hdr);
>  	}
>
>  	return tlen;
> @@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  {
>  	uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
>  	uint32_t prev_wnd_to_tap = conn->wnd_to_tap;
> -	struct tcp4_l2_flags_buf_t *b4 = NULL;
> -	struct tcp6_l2_flags_buf_t *b6 = NULL;
> +	struct tcp_l2_flags_t *payload;
>  	struct tcp_info tinfo = { 0 };
>  	socklen_t sl = sizeof(tinfo);
>  	int s = conn->sock;
>  	size_t optlen = 0;
>  	struct iovec *iov;
> +	struct iovec *dup_iov;

This should now go before 'int s' (longest to shortest), or the
declaration could be combined on the same line (it's really symmetric)
with *iov.

>  	struct tcphdr *th;
>  	char *data;
> -	void *p;
>
>  	if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
>  	    !flags && conn->wnd_to_tap)
> @@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		return 0;
>
>  	if (CONN_V4(conn)) {
> -		iov = tcp4_l2_flags_iov    + tcp4_l2_flags_buf_used;
> -		p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++;
> -		th = &b4->th;
> -
> -		/* gcc 11.2 would complain on data = (char *)(th + 1); */
> -		data = b4->opts;
> +		iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
> +		dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used];
>  	} else {
> -		iov = tcp6_l2_flags_iov    + tcp6_l2_flags_buf_used;
> -		p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++;
> -		th = &b6->th;
> -		data = b6->opts;
> +		iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];
> +		dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];

Maybe you could move the dup_iov assignments below, and group these
with the increments of tcp[46]_l2_flags_buf_used, otherwise I'll scream
"off-by-one" every time I see this (even though it's not actually the
case).
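One way to read that, as an untested sketch: take the duplicate slot,
and bump the counter, only where the DUP_ACK copy is actually made
(the separate DUP_ACK increments further down would then go away):

	if (CONN_V4(conn))
		iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
	else
		iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];

	[...]

	if (flags & DUP_ACK) {
		struct iovec *dup_iov;
		int i;

		/* Grab the next slot and account for it in one place */
		if (CONN_V4(conn))
			dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++];
		else
			dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++];

		for (i = 0; i < TCP_IOV_NUM; i++) {
			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
			       iov[i].iov_len);
			dup_iov[i].iov_len = iov[i].iov_len;
		}
	}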
>  	}
>
> +	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +	th = &payload->th;
> +	data = payload->opts;
>
>  	if (flags & SYN) {
>  		int mss;
> @@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  			mss = tinfo.tcpi_snd_mss;
>  		} else {
>  			mss = c->mtu - sizeof(struct tcphdr);
> -			if (CONN_V4(conn))
> -				mss -= sizeof(struct iphdr);
> -			else
> -				mss -= sizeof(struct ipv6hdr);
>
>  			if (c->low_wmem &&
>  			    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn))
> @@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	th->syn = !!(flags & SYN);
>  	th->fin = !!(flags & FIN);
>
> -	iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen,
> -					       NULL, conn->seq_to_tap);
> +	iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov,
> +							       optlen, NULL,
> +							       conn->seq_to_tap);
>
>  	if (th->ack) {
>  		if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
> @@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (th->fin || th->syn)
>  		conn->seq_to_tap++;
>
> +	if (flags & DUP_ACK) {
> +		int i;
> +		for (i = 0; i < TCP_IOV_NUM; i++) {
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +			dup_iov[i].iov_len = iov[i].iov_len;
> +		}
> +	}
> +
>  	if (CONN_V4(conn)) {
> -		if (flags & DUP_ACK) {
> -			memcpy(b4 + 1, b4, sizeof(*b4));
> -			(iov + 1)->iov_len = iov->iov_len;
> +		if (flags & DUP_ACK)
>  			tcp4_l2_flags_buf_used++;
> -		}
>
> -		if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2)
> +		if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
>  			tcp_l2_flags_buf_flush(c);
>  	} else {
> -		if (flags & DUP_ACK) {
> -			memcpy(b6 + 1, b6, sizeof(*b6));
> -			(iov + 1)->iov_len = iov->iov_len;
> +		if (flags & DUP_ACK)
>  			tcp6_l2_flags_buf_used++;
> -		}
>
> -		if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2)
> +		if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2)
>  			tcp_l2_flags_buf_flush(c);
>  	}
>
> @@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
>  	unsigned int mss;
>  	int ret;
>
> +	(void)conn; /* unused */
> +
>  	if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0)
>  		mss = MSS_DEFAULT;
>  	else
>  		mss = ret;
>
> -	if (CONN_V4(conn))
> -		mss = MIN(MSS4, mss);
> -	else
> -		mss = MIN(MSS6, mss);
> +	mss = MIN(MSS, mss);
>
>  	return MIN(mss, USHRT_MAX);
>  }
> @@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  	struct iovec *iov;
>
>  	if (CONN_V4(conn)) {
> -		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
> -		const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
> +		struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1];
> +		const uint16_t *check = NULL;
> +
> +		if (no_csum) {
> +			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
>
>  		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
>  		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
>
> -		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       check, seq);
> -		if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1)
> +		iov = tcp4_l2_iov[tcp4_l2_buf_used++];
> +		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
> +							iov, plen, check, seq);
> +		if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1)
>  			tcp_l2_data_buf_flush(c);
>  	} else if (CONN_V6(conn)) {
> -		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
> -
>  		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
>  		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
>
> -		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
> -		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
> -						       NULL, seq);
> -		if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1)
> +		iov = tcp6_l2_iov[tcp6_l2_buf_used++];
> +		iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn,
> +							iov, plen, NULL, seq);

Maybe use a temporary variable?
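For instance (sketch -- 'l4len', or whatever name, just to avoid the
awkward wrapping of the call):

		size_t l4len;

		[...]

		l4len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq);
		iov[TCP_IOV_PAYLOAD].iov_len = l4len;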
> +		if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1)
>  			tcp_l2_data_buf_flush(c);
>  	}
>  }
> @@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	iov_sock[0].iov_base = tcp_buf_discard;
>  	iov_sock[0].iov_len = already_sent;
>
> -	if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) ||
> -	    (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) {
> +	if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) ||
> +	    (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) {
>  		tcp_l2_data_buf_flush(c);
>
>  		/* Silence Coverity CWE-125 false positive */
> @@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
>  		if (v4)
> -			iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data;
> +			iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data;
>  		else
> -			iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data;
> +			iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)

The rest looks good to me (at the moment :)).

-- 
Stefano