To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. And as the payload buffer contains only the TCP header and the TCP data we can increase the size of the TCP data to USHRT_MAX - sizeof(struct tcphdr). As a side effect, these changes improve performance by a factor of x1.5. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 104 ++++++++++++++ tap.h | 16 +++ tcp.c | 429 ++++++++++++++++++++++++---------------------------------- 3 files changed, 296 insertions(+), 253 deletions(-) diff --git a/tap.c b/tap.c index c7b9372668ec..afd01cffbed6 100644 --- a/tap.c +++ b/tap.c @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c, return i; } +/** + * tap_send_iov_pasta() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. + * The first entry of the iovec is ignored + * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_pasta(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH], + TCP_IOV_NUM - TCP_IOV_ETH)) + break; + } + + return i; + +} + /** * tap_send_frames_passt() - Send multiple frames to the passt tap * @c: Execution context @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, return i; } +/** + * tap_send_iov_passt() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. + * The first entry of the iovec is updated to point to an + * uint32_t storing the frame length. 
+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_passt(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + uint32_t vnet_len; + int j; + + vnet_len = 0; + for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) + vnet_len += iov[i][j].iov_len; + + vnet_len = htonl(vnet_len); + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); + + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM)) + break; + } + + return i; + +} + /** * tap_send_frames() - Send out multiple prepared frames * @c: Execution context @@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n) return m; } +/** + * tap_send_iov() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. + * iovec array is: + * TCP_IOV_VNET (0) vnet length + * TCP_IOV_ETH (1) ethernet header + * TCP_IOV_IP (2) IP (v4/v6) header + * TCP_IOV_PAYLOAD (3) IP payload (TCP header + data) + * TCP_IOV_NUM (4) is the number of entries in the iovec array + * TCP_IOV_VNET entry is updated with passt, ignored with pasta. 
+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + size_t m; + unsigned int i; + + if (!n) + return 0; + + switch (c->mode) { + case MODE_PASST: + m = tap_send_iov_passt(c, iov, n); + break; + case MODE_PASTA: + m = tap_send_iov_pasta(c, iov, n); + break; + default: + ASSERT(0); + } + + if (m < n) + debug("tap: failed to send %zu frames of %zu", n - m, n); + + for (i = 0; i < m; i++) + pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH); + + return m; +} + /** * eth_update_mac() - Update tap L2 header with new Ethernet addresses * @eh: Ethernet headers to update diff --git a/tap.h b/tap.h index 437b9aa2b43f..2d8e7aa1101d 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,20 @@ #ifndef TAP_H #define TAP_H +/* + * TCP frame iovec array: + * TCP_IOV_VNET vnet length + * TCP_IOV_ETH ethernet header + * TCP_IOV_IP IP (v4/v6) header + * TCP_IOV_PAYLOAD IP payload (TCP header + data) + * TCP_IOV_NUM is the number of entries in the iovec array + */ +#define TCP_IOV_VNET 0 +#define TCP_IOV_ETH 1 +#define TCP_IOV_IP 2 +#define TCP_IOV_PAYLOAD 3 +#define TCP_IOV_NUM 4 + /** * struct tap_hdr - L2 and tap specific headers * @vnet_len: Frame length (for qemu socket transport) @@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c, const void *in, size_t len); int tap_send(const struct ctx *c, const void *data, size_t len); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n); +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], + size_t n); void eth_update_mac(struct ethhdr *eh, const unsigned char *eth_d, const unsigned char *eth_s); void tap_listen_handler(struct ctx *c, uint32_t events); diff --git a/tcp.c b/tcp.c index d5eedf4d0138..5c8488108ef7 100644 --- a/tcp.c +++ b/tcp.c @@ -318,39 +318,7 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in 
sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS (USHRT_MAX - sizeof(struct tcphdr)) #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -445,133 +413,78 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * tcp_l2_flags_t - TCP header and data to send option flags + * @th: TCP header + * @opts TCP option flags */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_l2_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +}; +/** + * tcp_l2_payload_t - TCP header and data to send data + * 32 bytes aligned to be able to use AVX2 checksum + * @th: TCP header + * @data: TCP data + */ +struct 
tcp_l2_payload_t { + struct tcphdr th; /* 20 bytes */ + uint8_t data[MSS]; /* 65516 bytes */ #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp4_l2_buf[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; +/* IPv4 headers */ +static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv4 frames */ +static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_buf_used; -/** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload - */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_buf[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP option flags frames */ +static struct iphdr tcp4_l2_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv4 frames */ +static struct tcp_l2_flags_t tcp4_l2_flags[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_l2_flags_buf_used; + +/* Ethernet header for IPv6 frames 
*/ +static struct ethhdr tcp6_eth_src; +/* IPv6 headers */ +static struct ipv6hdr tcp6_l2_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_l2_payload_t tcp6_l2_payload[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; +/* IPv6 headers for TCP option flags frames */ +static struct ipv6hdr tcp6_l2_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv6 frames */ +static struct tcp_l2_flags_t tcp6_l2_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_l2_flags_buf_used; + /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) 
-#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) */ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); } /** @@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, 
const unsigned char *eth_s) static void tcp_sock4_iov_init(const struct ctx *c) { struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + (void)c; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; + + /* headers */ + tcp4_l2_ip[i] = iph; + tcp4_l2_payload[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); + tcp4_l2_flags_ip[i] = iph; + tcp4_l2_flags[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); + /* iovecs */ + iov = tcp4_l2_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i]; + + iov = tcp4_l2_flags_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i]; + } } /** @@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c) */ static void tcp_sock6_iov_init(const struct ctx *c) { - struct iovec *iov; + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); int i; - for (i 
= 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + (void)c; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; + + /* headers */ + tcp6_l2_ip[i] = ip6; + tcp6_l2_payload[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph); + tcp6_l2_flags_ip[i] = ip6; + tcp6_l2_flags[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); + /* iovecs */ + iov = tcp6_l2_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i]; + + iov = tcp6_l2_flags_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i]; + } } /** @@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); */ static void tcp_l2_flags_buf_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); tcp6_l2_flags_buf_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, 
tcp4_l2_flags_buf_used); + tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); tcp4_l2_flags_buf_used = 0; } @@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); + m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used); for (i = 0; i < m; i++) *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; tcp6_l2_buf_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); + m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used); for (i = 0; i < m; i++) *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; tcp4_l2_buf_used = 0; @@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c, * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffer * @plen: Payload length (including TCP header options) * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c, */ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, const struct tcp_tap_conn *conn, - void *p, size_t plen, + struct iovec *iov, size_t plen, const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; - - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, + ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, check, seq); - - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct iphdr); } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, + ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, + 
iov[TCP_IOV_PAYLOAD].iov_base, plen, seq); - - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct ipv6hdr); } return tlen; @@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; + struct tcp_l2_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; size_t optlen = 0; struct iovec *iov; + struct iovec *dup_iov; struct tcphdr *th; char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; + iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++]; + dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used]; } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; + iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++]; + dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used]; } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + th = &payload->th; + data = payload->opts; if (flags & SYN) { int mss; @@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); - if (CONN_V4(conn)) - mss -= sizeof(struct iphdr); - else - mss -= sizeof(struct ipv6hdr); if (c->low_wmem && !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) @@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = 
!!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov, + optlen, NULL, + conn->seq_to_tap); if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) @@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; + if (flags & DUP_ACK) { + int i; + for (i = 0; i < TCP_IOV_NUM; i++) { + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[i].iov_len = iov[i].iov_len; + } + } + if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; + if (flags & DUP_ACK) tcp4_l2_flags_buf_used++; - } - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) + if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2) tcp_l2_flags_buf_flush(c); } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; + if (flags & DUP_ACK) tcp6_l2_flags_buf_used++; - } - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) + if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2) tcp_l2_flags_buf_flush(c); } @@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, unsigned int mss; int ret; + (void)conn; /* unused */ + if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) mss = MSS_DEFAULT; else mss = ret; - if (CONN_V4(conn)) - mss = MIN(MSS4, mss); - else - mss = MIN(MSS6, mss); + mss = MIN(MSS, mss); return MIN(mss, USHRT_MAX); } @@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, struct iovec *iov; if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? 
&(b - 1)->iph.check : NULL; + struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1]; + const uint16_t *check = NULL; + + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) + iov = tcp4_l2_iov[tcp4_l2_buf_used++]; + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, + iov, plen, check, seq); + if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1) tcp_l2_data_buf_flush(c); } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) + iov = tcp6_l2_iov[tcp6_l2_buf_used++]; + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, + iov, plen, NULL, seq); + if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1) tcp_l2_data_buf_flush(c); } } @@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { + if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) { tcp_l2_data_buf_flush(c); /* Silence Coverity CWE-125 false positive */ @@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) - iov->iov_base = 
&tcp4_l2_buf[tcp4_l2_buf_used + i].data; + iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data; else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data; iov->iov_len = mss; } if (iov_rem) -- 2.42.0
On Mon, 11 Mar 2024 14:33:56 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. And as the payload buffer contains only the TCP header and the TCP data we can increase the size of the TCP data to USHRT_MAX - sizeof(struct tcphdr). As a side effect, these changes improve performance by a factor of x1.5. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 104 ++++++++++++++ tap.h | 16 +++ tcp.c | 429 ++++++++++++++++++++++++---------------------------------- 3 files changed, 296 insertions(+), 253 deletions(-) diff --git a/tap.c b/tap.c index c7b9372668ec..afd01cffbed6 100644 --- a/tap.c +++ b/tap.c @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c, return i; } +/** + * tap_send_iov_pasta() - Send out multiple prepared frames...to the pasta tap. That is, this should get the description of the existing tap_send_frames_pasta(), and, I guess, tap_send_frames_pasta() should now switch to writev(), as it's used to send one single message from multiple buffers, otherwise you'll get a lot of system calls for pasta? I'm still in the middle of reviewing here, but I thought I would comment on this right away because it looks quite fundamental. -- Stefano
On Mon, 11 Mar 2024 14:33:56 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. And as the payload buffer contains only the TCP header and the TCP data we can increase the size of the TCP data to USHRT_MAX - sizeof(struct tcphdr). As a side effect, these changes improve performance by a factor of x1.5. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 104 ++++++++++++++ tap.h | 16 +++ tcp.c | 429 ++++++++++++++++++++++++---------------------------------- 3 files changed, 296 insertions(+), 253 deletions(-) diff --git a/tap.c b/tap.c index c7b9372668ec..afd01cffbed6 100644 --- a/tap.c +++ b/tap.c @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c, return i; } +/** + * tap_send_iov_pasta() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. 
+ * The first entry of the iovec is ignoredFits on single lines (I think it's more readable): * @c: Execution context * @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET unused * @n: Number of frames in @iov+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_pasta(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH], + TCP_IOV_NUM - TCP_IOV_ETH)) + break; + } + + return i; + +} + /** * tap_send_frames_passt() - Send multiple frames to the passt tap * @c: Execution context @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, return i; } +/** + * tap_send_iov_passt() - Send out multiple prepared frames...I would argue that this function prepares frames as well. Maybe: * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames+ * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. 
+ * The first entry of the iovec is updated to point to an + * uint32_t storing the frame length.* @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET blank+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_passt(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + uint32_t vnet_len; + int j; + + vnet_len = 0;This could be initialised in the declaration (yes, it's "reset" at every loop iteration).+ for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) + vnet_len += iov[i][j].iov_len; + + vnet_len = htonl(vnet_len); + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); + + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))...which would now send a single frame at a time, but actually it can already send everything in one shot because it's using sendmsg(), if you move it outside of the loop and do something like (untested): return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n);+ break; + } + + return i; + +} + /** * tap_send_frames() - Send out multiple prepared frames * @c: Execution context @@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n) return m; } +/** + * tap_send_iov() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. + * iovec array is:I think this description should be moved before the defines -- it could even be an enum with fixed numbers, and I just realised that in kernel-doc enums should be documented in the same way as structs (we didn't do that so far), that is: /** * enum tcp_iov_parts - I/O vector parts for one TCP frame * @TCP_IOV_NET: vnet length: host order, 32-bit frame length descriptor * ... */ enum tcp_iov_parts { /* vnet length (host order, 32-bit length descriptor) */ TCP_IOV_VNET = 0, [...] 
};+ * TCP_IOV_VNET (0) vnet length + * TCP_IOV_ETH (1) ethernet header + * TCP_IOV_IP (2) IP (v4/v6) header + * TCP_IOV_PAYLOAD (3) IP payload (TCP header + data) + * TCP_IOV_NUM (4) is the number of entries in the iovec array + * TCP_IOV_VNET entry is updated with passt, ignored with pasta. + * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + size_t m; + unsigned int i; + + if (!n) + return 0; + + switch (c->mode) { + case MODE_PASST: + m = tap_send_iov_passt(c, iov, n); + break; + case MODE_PASTA: + m = tap_send_iov_pasta(c, iov, n); + break; + default: + ASSERT(0); + } + + if (m < n) + debug("tap: failed to send %zu frames of %zu", n - m, n); + + for (i = 0; i < m; i++) + pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH); + + return m; +} + /** * eth_update_mac() - Update tap L2 header with new Ethernet addresses * @eh: Ethernet headers to update diff --git a/tap.h b/tap.h index 437b9aa2b43f..2d8e7aa1101d 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,20 @@ #ifndef TAP_H #define TAP_H +/* + * TCP frame iovec array: + * TCP_IOV_VNET vnet length + * TCP_IOV_ETH ethernet header + * TCP_IOV_IP IP (v4/v6) header + * TCP_IOV_PAYLOAD IP payload (TCP header + data) + * TCP_IOV_NUM is the number of entries in the iovec array + */ +#define TCP_IOV_VNET 0 +#define TCP_IOV_ETH 1 +#define TCP_IOV_IP 2 +#define TCP_IOV_PAYLOAD 3 +#define TCP_IOV_NUM 4 + /** * struct tap_hdr - L2 and tap specific headers * @vnet_len: Frame length (for qemu socket transport) @@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c, const void *in, size_t len); int tap_send(const struct ctx *c, const void *data, size_t len); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n); +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], + size_t n); void eth_update_mac(struct ethhdr *eh, const unsigned char *eth_d, const unsigned 
char *eth_s); void tap_listen_handler(struct ctx *c, uint32_t events); diff --git a/tcp.c b/tcp.c index d5eedf4d0138..5c8488108ef7 100644 --- a/tcp.c +++ b/tcp.c @@ -318,39 +318,7 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endifI'm so glad to see this going away.-#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS (USHRT_MAX - sizeof(struct tcphdr))We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4 should now be: #define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) - sizeof(struct iphdr) - sizeof(struct tcphdr), 4) ...and similar for IPv6.#define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -445,133 +413,78 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * tcp_l2_flags_t - TCP header and data to send option flags'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).+ * @th: TCP 
header + * @opts TCP option flags */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_l2_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +};This should still be aligned (when declared below) with: #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif because yes, now you get the unsigned int alignment for free, but the 32-byte boundary for AVX2 (checksums) not really, so that needs to be there, and then for clarity I would keep the explicit attribute for the non-AVX2 case. By the way, I guess you could still combine the definition and declaration as it was done earlier.+/** + * tcp_l2_payload_t - TCP header and data to send data + * 32 bytes aligned to be able to use AVX2 checksum + * @th: TCP header + * @data: TCP data + */ +struct tcp_l2_payload_t { + struct tcphdr th; /* 20 bytes */ + uint8_t data[MSS]; /* 65516 bytes */ #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp4_l2_buf[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; +/* IPv4 headers */ +static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv4 frames */ +static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_buf_used; -/** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap 
connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload - */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_buf[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP option flags frames */ +static struct iphdr tcp4_l2_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv4 frames */ +static struct tcp_l2_flags_t tcp4_l2_flags[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_l2_flags_buf_used; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; +/* IPv6 headers */ +static struct ipv6hdr tcp6_l2_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_l2_payload_t tcp6_l2_payload[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; +/* IPv6 headers for TCP option flags frames */ +static struct ipv6hdr tcp6_l2_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv6 frames */ +static struct tcp_l2_flags_t tcp6_l2_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_l2_flags_buf_used; + /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec 
tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN 
+ 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) */ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)This doesn't update "L2 buffers" anymore, just the Ethernet addresses (function comment should be updated).{ - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); } /** @@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void tcp_sock4_iov_init(const struct ctx *c) { struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + (void)c;...then drop it from the parameter list?- for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; + + /* headers */ + 
tcp4_l2_ip[i] = iph; + tcp4_l2_payload[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); + tcp4_l2_flags_ip[i] = iph; + tcp4_l2_flags[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); + /* iovecs */ + iov = tcp4_l2_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i]; + + iov = tcp4_l2_flags_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i]; + } } /** @@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c) */ static void tcp_sock6_iov_init(const struct ctx *c) { - struct iovec *iov; + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); int i; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + (void)c; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; + + /* headers */ + tcp6_l2_ip[i] = ip6; + tcp6_l2_payload[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov 
= tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph); + tcp6_l2_flags_ip[i] = ip6; + tcp6_l2_flags[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); + /* iovecs */ + iov = tcp6_l2_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i]; + + iov = tcp6_l2_flags_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i]; + } } /** @@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); */ static void tcp_l2_flags_buf_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); tcp6_l2_flags_buf_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); + tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); tcp4_l2_flags_buf_used = 0; } @@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); + m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used); for (i = 0; i < m; i++) *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; tcp6_l2_buf_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); + m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used); for (i = 0; i < m; i++) *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; tcp4_l2_buf_used = 0; @@ -1433,7 +1362,7 @@ static size_t 
tcp_fill_headers6(const struct ctx *c, * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffer * @plen: Payload length (including TCP header options) * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c, */ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, const struct tcp_tap_conn *conn, - void *p, size_t plen, + struct iovec *iov, size_t plen, const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; - - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, + ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, check, seq); - - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct iphdr); } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, + ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, seq); - - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct ipv6hdr); } return tlen; @@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; + struct tcp_l2_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; size_t optlen = 0; struct iovec *iov; + struct iovec *dup_iov;This should now go before 'int s' (longest to shortest), or the declaration could be combined on the same line 
(it's really symmetric) with *iov.struct tcphdr *th; char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; + iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++]; + dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used]; } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; + iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++]; + dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used];Maybe you could move the dup_iov assignments below, and group these with the increments of tcp[46]_l2_flags_buf_used, otherwise I'll scream "off-by-one" every time I see this (even though it's not actually the case).} + payload = iov[TCP_IOV_PAYLOAD].iov_base; + th = &payload->th; + data = payload->opts; if (flags & SYN) { int mss; @@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); - if (CONN_V4(conn)) - mss -= sizeof(struct iphdr); - else - mss -= sizeof(struct ipv6hdr); if (c->low_wmem && !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) @@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov, + optlen, NULL, + conn->seq_to_tap); if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) @@ -1705,23 +1623,26 @@ static int 
tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; + if (flags & DUP_ACK) { + int i; + for (i = 0; i < TCP_IOV_NUM; i++) { + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[i].iov_len = iov[i].iov_len; + } + } + if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; + if (flags & DUP_ACK) tcp4_l2_flags_buf_used++; - } - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) + if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2) tcp_l2_flags_buf_flush(c); } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; + if (flags & DUP_ACK) tcp6_l2_flags_buf_used++; - } - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) + if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2) tcp_l2_flags_buf_flush(c); } @@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, unsigned int mss; int ret; + (void)conn; /* unused */ + if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) mss = MSS_DEFAULT; else mss = ret; - if (CONN_V4(conn)) - mss = MIN(MSS4, mss); - else - mss = MIN(MSS6, mss); + mss = MIN(MSS, mss); return MIN(mss, USHRT_MAX); } @@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, struct iovec *iov; if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? 
&(b - 1)->iph.check : NULL; + struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1]; + const uint16_t *check = NULL; + + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) + iov = tcp4_l2_iov[tcp4_l2_buf_used++]; + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, + iov, plen, check, seq); + if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1) tcp_l2_data_buf_flush(c); } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) + iov = tcp6_l2_iov[tcp6_l2_buf_used++]; + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, + iov, plen, NULL, seq);Maybe use a temporary variable?+ if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1) tcp_l2_data_buf_flush(c); } } @@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { + if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) { tcp_l2_data_buf_flush(c); /* Silence Coverity CWE-125 false positive */ @@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) - 
iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; + iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data; else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data; iov->iov_len = mss; } if (iov_rem)The rest looks good to me (at the moment :)). -- Stefano
On 3/13/24 12:37, Stefano Brivio wrote:On Mon, 11 Mar 2024 14:33:56 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:...> diff --git a/tcp.c b/tcp.c > index d5eedf4d0138..5c8488108ef7 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -318,39 +318,7 @@...Is there a specification that limits the MSS? Or is it only a compatibility problem with the network stack implementation? At headers level the only limitation we have is the length field size in the IP header that is a 16bit (it's why I put "USHRT_MAX - sizeof(struct tcphdr)"). Thanks, Laurent-#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS (USHRT_MAX - sizeof(struct tcphdr))We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4 should now be: #define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) - sizeof(struct iphdr) - sizeof(struct tcphdr), 4) ...and similar for IPv6.
On Wed, 13 Mar 2024 15:42:27 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 3/13/24 12:37, Stefano Brivio wrote:Well, on Linux, virtual network interfaces such as virtio-net still have their maximum configurable MTU defined as ETH_MAX_MTU (same as the maximum IPv4 MTU, 65535), and that's a symmetric value (in standards, drivers, and elsewhere in the network stack). Side note: the TCP MSS doesn't need to be the same value for both directions, instead. But other than that, no, there are no normative references. It's an implementation "detail" if you want.On Mon, 11 Mar 2024 14:33:56 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:...> diff --git a/tcp.c b/tcp.c > index d5eedf4d0138..5c8488108ef7 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -318,39 +318,7 @@...Is there a specification that limits the MSS?-#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS (USHRT_MAX - sizeof(struct tcphdr))We can't exceed USHRT_MAX at Layer-2 level, though, so the MSS for IPv4 should now be: #define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct ethhdr) - sizeof(struct iphdr) - sizeof(struct tcphdr), 4) ...and similar for IPv6.Or is it only a compatibility problem with the network stack implementation?Kind of, yes. Except that:At headers level the only limitation we have is the length field size in the IP header that is a 16bit (it's why I put "USHRT_MAX - sizeof(struct tcphdr)")....in IPv4, that field also contains the length of the *IP header*, so, ignoring for a moment Linux and virtio-net with ETH_MAX_MTU, you would be limited to 65495 bytes (65535 minus 20 bytes of IP header, minus 20 bytes of TCP header). As RFC 791, section 3.1, states: Total Length: 16 bits Total Length is the length of the datagram, measured in octets, including internet header and data. This field allows the length of a datagram to be up to 65,535 octets. [...] ...but 65535 is the *MTU*, not MSS. 
And if it needs to fit ETH_MAX_MTU, you're now back to 65520 bytes of MTU (it needs to match IPv4 32-bit words, if I recall correctly), hence 65480 bytes of MSS. IPv6 is different, in that the equivalent field doesn't include the size of the IPv6 header. But the header is 40 bytes long, so the outcome is the same. Well, except that you could have jumbograms (RFC 2675) and exceed 65535 bytes of IPv6 MTU, but that won't work anyway with Ethernet-like interfaces. So, well, I haven't actually tried to send an Ethernet frame to a virtio-net interface that's bigger than 65535 bytes. As far as normative references are concerned, you could send 65549 (65535 bytes maximum IPv4 MTU, plus 14 bytes of 802.3 header on top) bytes. I guess it will be dropped by the kernel, but it's perhaps worth a try. -- Stefano
On 3/13/24 12:37, Stefano Brivio wrote:On Mon, 11 Mar 2024 14:33:56 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:...We can't combine the definition and declaration because the same definition is used with IPv4 and IPv6 TCP arrays declaration. I'm not sure: the __attribute__ must be used on the structure definition or on the array of structures declaration? Thanks, Laurent@@ -445,133 +413,78 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * tcp_l2_flags_t - TCP header and data to send option flags'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).+ * @th: TCP header + * @opts TCP option flags */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_l2_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +};This should still be aligned (when declared below) with: #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif because yes, now you get the unsigned int alignment for free, but the 32-byte boundary for AVX2 (checksums) not really, so that needs to be there, and then for clarity I would keep the explicit attribute for the non-AVX2 case. By the way, I guess you could still combine the definition and declaration as it was done earlier.
On Wed, 13 Mar 2024 16:20:12 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 3/13/24 12:37, Stefano Brivio wrote:Ah, sorry, of course! By the way of that, I wonder: would it then be possible to have a single copy of payload buffers, that's shared for IPv4 and IPv6? We would probably need to introduce a separate counter, on top of tcp[46]_l2_buf_used (which could become tcp[46]_l2_hdr_used). Well, probably not in scope for this change anyway.On Mon, 11 Mar 2024 14:33:56 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:...We can't combine the definition and declaration because the same definition is used with IPv4 and IPv6 TCP arrays declaration.@@ -445,133 +413,78 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * tcp_l2_flags_t - TCP header and data to send option flags'struct tcp_l2_flags_t - ...', by the way (pre-existing, I see).+ * @th: TCP header + * @opts TCP option flags */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_l2_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +};This should still be aligned (when declared below) with: #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif because yes, now you get the unsigned int alignment for free, but the 32-byte boundary for AVX2 (checksums) not really, so that needs to be there, and then for 
clarity I would keep the explicit attribute for the non-AVX2 case. By the way, I guess you could still combine the definition and declaration as it was done earlier.I'm not sure: the __attribute__ must be used on the structure definition or on the array of structures declaration?__attribute__ can be used on definitions or declarations depending on the type of attribute itself -- here it would be on the definition, because it affects the definition of the data type itself. Strictly speaking, you can use it on the declaration too, but gcc will ignore it: $ cat attr.c struct a { int b; }; int main() { struct a c __attribute__ ((packed, aligned(32))); return 0; } $ gcc -o attr attr.c attr.c: In function ‘main’: attr.c:7:12: warning: ‘packed’ attribute ignored [-Wattributes] 7 | struct a c __attribute__ ((packed, aligned(32))); | -- Stefano
On 3/13/24 12:37, Stefano Brivio wrote: ...I tried to do something like that but I have a performance drop: static size_t tap_send_iov_passt(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], size_t n) { unsigned int i; uint32_t vnet_len[n]; for (i = 0; i < n; i++) { int j; vnet_len[i] = 0; for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) vnet_len[i] += iov[i][j].iov_len; vnet_len[i] = htonl(vnet_len[i]); iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t); } return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM; } iperf3 -c localhost -p 10001 -t 60 -4 berfore [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 33.0 GBytes 4.72 Gbits/sec 1 sender [ 5] 0.00-60.06 sec 33.0 GBytes 4.72 Gbits/sec receiver after: [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 18.2 GBytes 2.60 Gbits/sec 0 sender [ 5] 0.00-60.07 sec 18.2 GBytes 2.60 Gbits/sec receiver iperf3 -c localhost -p 10001 -t 60 -6 before [ 5] 0.00-60.00 sec 25.5 GBytes 3.65 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 25.5 GBytes 3.65 Gbits/sec receiver after: [ 5] 0.00-60.00 sec 16.1 GBytes 2.31 Gbits/sec 0 sender [ 5] 0.00-60.07 sec 16.1 GBytes 2.31 Gbits/sec receiver Thanks, Laurent@@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, return i; } +/** + * tap_send_iov_passt() - Send out multiple prepared frames...I would argue that this function prepares frames as well. Maybe: * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames+ * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. 
+ * The first entry of the iovec is updated to point to an + * uint32_t storing the frame length.* @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET blank+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_passt(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + uint32_t vnet_len; + int j; + + vnet_len = 0;This could be initialised in the declaration (yes, it's "reset" at every loop iteration).+ for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) + vnet_len += iov[i][j].iov_len; + + vnet_len = htonl(vnet_len); + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); + + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))...which would now send a single frame at a time, but actually it can already send everything in one shot because it's using sendmsg(), if you move it outside of the loop and do something like (untested): return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n); > + break; > + } > + > + return i; > + > +} > +
On Thu, 14 Mar 2024 15:07:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 3/13/24 12:37, Stefano Brivio wrote: ...Weird, it looks like doing one sendmsg() per frame results in a higher throughput than one sendmsg() per multiple frames, which sounds rather absurd. Perhaps we should start looking into what perf(1) reports, in terms of both syscall overhead and cache misses. I'll have a look later today or tomorrow -- unless you have other ideas as to why this might happen... -- StefanoI tried to do something like that but I have a performance drop: static size_t tap_send_iov_passt(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], size_t n) { unsigned int i; uint32_t vnet_len[n]; for (i = 0; i < n; i++) { int j; vnet_len[i] = 0; for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) vnet_len[i] += iov[i][j].iov_len; vnet_len[i] = htonl(vnet_len[i]); iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t); } return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM; } iperf3 -c localhost -p 10001 -t 60 -4 berfore [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 33.0 GBytes 4.72 Gbits/sec 1 sender [ 5] 0.00-60.06 sec 33.0 GBytes 4.72 Gbits/sec receiver after: [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 18.2 GBytes 2.60 Gbits/sec 0 sender [ 5] 0.00-60.07 sec 18.2 GBytes 2.60 Gbits/sec receiver@@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, return i; } +/** + * tap_send_iov_passt() - Send out multiple prepared frames...I would argue that this function prepares frames as well. Maybe: * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames+ * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. 
+ * The first entry of the iovec is updated to point to an + * uint32_t storing the frame length.* @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET blank+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_passt(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + uint32_t vnet_len; + int j; + + vnet_len = 0;This could be initialised in the declaration (yes, it's "reset" at every loop iteration).+ for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) + vnet_len += iov[i][j].iov_len; + + vnet_len = htonl(vnet_len); + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); + + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))...which would now send a single frame at a time, but actually it can already send everything in one shot because it's using sendmsg(), if you move it outside of the loop and do something like (untested): return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n); > + break; > + } > + > + return i; > + > +} > +
On 3/14/24 16:47, Stefano Brivio wrote:On Thu, 14 Mar 2024 15:07:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:Perhaps in the first case we only update one vnet_len and in the second case we have to update an array of vnet_len, so there is a use of more cache lines? Thanks, LaurentOn 3/13/24 12:37, Stefano Brivio wrote: ...Weird, it looks like doing one sendmsg() per frame results in a higher throughput than one sendmsg() per multiple frames, which sounds rather absurd. Perhaps we should start looking into what perf(1) reports, in terms of both syscall overhead and cache misses. I'll have a look later today or tomorrow -- unless you have other ideas as to why this might happen...I tried to do something like that but I have a performance drop: static size_t tap_send_iov_passt(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], size_t n) { unsigned int i; uint32_t vnet_len[n]; for (i = 0; i < n; i++) { int j; vnet_len[i] = 0; for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) vnet_len[i] += iov[i][j].iov_len; vnet_len[i] = htonl(vnet_len[i]); iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t); } return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM; } iperf3 -c localhost -p 10001 -t 60 -4 before [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 33.0 GBytes 4.72 Gbits/sec 1 sender [ 5] 0.00-60.06 sec 33.0 GBytes 4.72 Gbits/sec receiver after: [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 18.2 GBytes 2.60 Gbits/sec 0 sender [ 5] 0.00-60.07 sec 18.2 GBytes 2.60 Gbits/sec receiver@@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, return i; } +/** + * tap_send_iov_passt() - Send out multiple prepared frames...I would argue that this function prepares frames as well. Maybe: * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames+ * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. 
+ * The first entry of the iovec is updated to point to an + * uint32_t storing the frame length.* @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET blank+ * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_passt(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + uint32_t vnet_len; + int j; + + vnet_len = 0;This could be initialised in the declaration (yes, it's "reset" at every loop iteration).+ for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) + vnet_len += iov[i][j].iov_len; + + vnet_len = htonl(vnet_len); + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); + + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM))...which would now send a single frame at a time, but actually it can already send everything in one shot because it's using sendmsg(), if you move it outside of the loop and do something like (untested): return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n); > + break; > + } > + > + return i; > + > +} > +
On Thu, 14 Mar 2024 16:54:02 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 3/14/24 16:47, Stefano Brivio wrote:Yes, I'm wondering if for example this: iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; causes a prefetch of everything pointed by iov[i][...], so we would prefetch (and throw away) each buffer, one by one. Another interesting experiment to verify if this is the case could be to "flush" a few frames at a time (say, 4), with something like this on top of your original change (completely untested): [...] if (!((i + 1) % 4) && !tap_send_frames_passt(c, iov[i / 4], TCP_IOV_NUM * 4)) break; } if ((i + 1) % 4) { tap_send_frames_passt(c, iov[i / 4], TCP_IOV_NUM * ((i + 1) % 4)); } Or maybe we could set vnet_len right after we receive data in the buffers. -- StefanoOn Thu, 14 Mar 2024 15:07:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:Perhaps in first case we only update one vnet_len and in the second case we have to update an array of vnet_len, so there is an use of more cache lines?On 3/13/24 12:37, Stefano Brivio wrote: ...Weird, it looks like doing one sendmsg() per frame results in a higher throughput than one sendmsg() per multiple frames, which sounds rather absurd. Perhaps we should start looking into what perf(1) reports, in terms of both syscall overhead and cache misses. I'll have a look later today or tomorrow -- unless you have other ideas as to why this might happen...> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, > return i; > } > > +/** > + * tap_send_iov_passt() - Send out multiple prepared frames ...I would argue that this function prepares frames as well. Maybe: * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames > + * @c: Execution context > + * @iov: Array of frames, each frames is divided in an array of iovecs. > + * The first entry of the iovec is updated to point to an > + * uint32_t storing the frame length. 
* @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET blank > + * @n: Number of frames in @iov > + * > + * Return: number of frames actually sent > + */ > +static size_t tap_send_iov_passt(const struct ctx *c, > + struct iovec iov[][TCP_IOV_NUM], > + size_t n) > +{ > + unsigned int i; > + > + for (i = 0; i < n; i++) { > + uint32_t vnet_len; > + int j; > + > + vnet_len = 0; This could be initialised in the declaration (yes, it's "reset" at every loop iteration). > + for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) > + vnet_len += iov[i][j].iov_len; > + > + vnet_len = htonl(vnet_len); > + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; > + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); > + > + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM)) ...which would now send a single frame at a time, but actually it can already send everything in one shot because it's using sendmsg(), if you move it outside of the loop and do something like (untested): return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n); > + break; > + } > + > + return i; > + > +} > +I tried to do something like that but I have a performance drop: static size_t tap_send_iov_passt(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], size_t n) { unsigned int i; uint32_t vnet_len[n]; for (i = 0; i < n; i++) { int j; vnet_len[i] = 0; for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) vnet_len[i] += iov[i][j].iov_len; vnet_len[i] = htonl(vnet_len[i]); iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t); } return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM; } iperf3 -c localhost -p 10001 -t 60 -4 berfore [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 33.0 GBytes 4.72 Gbits/sec 1 sender [ 5] 0.00-60.06 sec 33.0 GBytes 4.72 Gbits/sec receiver after: [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-60.00 sec 18.2 GBytes 2.60 Gbits/sec 0 sender [ 5] 0.00-60.07 sec 18.2 GBytes 2.60 Gbits/sec receiver
On Thu, Mar 14, 2024 at 05:26:17PM +0100, Stefano Brivio wrote:On Thu, 14 Mar 2024 16:54:02 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote: > On 3/14/24 16:47, Stefano Brivio wrote: > > On Thu, 14 Mar 2024 15:07:48 +0100 > > Laurent Vivier <lvivier(a)redhat.com> wrote: > > > >> On 3/13/24 12:37, Stefano Brivio wrote: > >> ... > >>>> @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, > >>>> return i; > >>>> } > >>>> > >>>> +/** > >>>> + * tap_send_iov_passt() - Send out multiple prepared frames > >>> > >>> ...I would argue that this function prepares frames as well. Maybe: > >>> > >>> * tap_send_iov_passt() - Prepare TCP_IOV_VNET parts and send multiple frames > >>> > >>>> + * @c: Execution context > >>>> + * @iov: Array of frames, each frames is divided in an array of iovecs. > >>>> + * The first entry of the iovec is updated to point to an > >>>> + * uint32_t storing the frame length. > >>> > >>> * @iov: Array of frames, each one a vector of parts, TCP_IOV_VNET blank > >>> > >>>> + * @n: Number of frames in @iov > >>>> + * > >>>> + * Return: number of frames actually sent > >>>> + */ > >>>> +static size_t tap_send_iov_passt(const struct ctx *c, > >>>> + struct iovec iov[][TCP_IOV_NUM], > >>>> + size_t n) > >>>> +{ > >>>> + unsigned int i; > >>>> + > >>>> + for (i = 0; i < n; i++) { > >>>> + uint32_t vnet_len; > >>>> + int j; > >>>> + > >>>> + vnet_len = 0; > >>> > >>> This could be initialised in the declaration (yes, it's "reset" at > >>> every loop iteration). 
> >>> > >>>> + for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) > >>>> + vnet_len += iov[i][j].iov_len; > >>>> + > >>>> + vnet_len = htonl(vnet_len); > >>>> + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; > >>>> + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); > >>>> + > >>>> + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM)) > >>> > >>> ...which would now send a single frame at a time, but actually it can > >>> already send everything in one shot because it's using sendmsg(), if you > >>> move it outside of the loop and do something like (untested): > >>> > >>> return tap_send_frames_passt(c, iov, TCP_IOV_NUM * n); > >>> > >>>> + break; > >>>> + } > >>>> + > >>>> + return i; > >>>> + > >>>> +} > >>>> + > >> > >> I tried to do something like that but I have a performance drop: > >> > >> static size_t tap_send_iov_passt(const struct ctx *c, > >> struct iovec iov[][TCP_IOV_NUM], > >> size_t n) > >> { > >> unsigned int i; > >> uint32_t vnet_len[n]; > >> > >> for (i = 0; i < n; i++) { > >> int j; > >> > >> vnet_len[i] = 0; > >> for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) > >> vnet_len[i] += iov[i][j].iov_len; > >> > >> vnet_len[i] = htonl(vnet_len[i]); > >> iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; > >> iov[i][TCP_IOV_VNET].iov_len = sizeof(uint32_t); > >> } > >> > >> return tap_send_frames_passt(c, &iov[0][0], TCP_IOV_NUM * n) / TCP_IOV_NUM; > >> } > >> > >> iperf3 -c localhost -p 10001 -t 60 -4 > >> > >> berfore > >> [ ID] Interval Transfer Bitrate Retr > >> [ 5] 0.00-60.00 sec 33.0 GBytes 4.72 Gbits/sec 1 sender > >> [ 5] 0.00-60.06 sec 33.0 GBytes 4.72 Gbits/sec receiver > >> > >> after: > >> [ ID] Interval Transfer Bitrate Retr > >> [ 5] 0.00-60.00 sec 18.2 GBytes 2.60 Gbits/sec 0 sender > >> [ 5] 0.00-60.07 sec 18.2 GBytes 2.60 Gbits/sec receiver > > > > Weird, it looks like doing one sendmsg() per frame results in a higher > > throughput than one sendmsg() per multiple frames, which sounds rather > > absurd. 
Perhaps we should start looking into what perf(1) reports, in > > terms of both syscall overhead and cache misses. > > > > I'll have a look later today or tomorrow -- unless you have other > > ideas as to why this might happen... > > Perhaps in first case we only update one vnet_len and in the second case we have to update > an array of vnet_len, so there is an use of more cache lines?We should be able to test this relatively easily, yes? By updating all the vnet_len then using a single sendmsg().Yes, I'm wondering if for example this: iov[i][TCP_IOV_VNET].iov_base = &vnet_len[i]; causes a prefetch of everything pointed by iov[i][...], so we would prefetch (and throw away) each buffer, one by one. Another interesting experiment to verify if this is the case could be to "flush" a few frames at a time (say, 4), with something like this on top of your original change (completely untested): [...] if (!((i + 1) % 4) && !tap_send_frames_passt(c, iov[i / 4], TCP_IOV_NUM * 4)) break; } if ((i + 1) % 4) { tap_send_frames_passt(c, iov[i / 4], TCP_IOV_NUM * ((i + 1) % 4)); } Or maybe we could set vnet_len right after we receive data in the buffers.I really hope we can avoid this. If we want to allow IPv4<->IPv6 translation, then we can't know the vnet_len until after we've done our routing / translation. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Mon, Mar 11, 2024 at 02:33:56PM +0100, Laurent Vivier wrote:To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. And as the payload buffer contains only the TCP header and the TCP data we can increase the size of the TCP data to USHRT_MAX - sizeof(struct tcphdr). As a side effect, these changes improve performance by a factor of x1.5. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 104 ++++++++++++++ tap.h | 16 +++ tcp.c | 429 ++++++++++++++++++++++++---------------------------------- 3 files changed, 296 insertions(+), 253 deletions(-) diff --git a/tap.c b/tap.c index c7b9372668ec..afd01cffbed6 100644 --- a/tap.c +++ b/tap.c @@ -350,6 +350,30 @@ static size_t tap_send_frames_pasta(const struct ctx *c, return i; } +/** + * tap_send_iov_pasta() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. + * The first entry of the iovec is ignored + * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_pasta(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], size_t n)As Stefano also points out, this isn't quite right. At the very least the names are misleading, and I'm pretty sure it also won't work right: the tuntap interface requires a whole frame to be given with a single write()/writev(), you can't split it into multiple writes. I also think it's kind of a layering violation to refer to TCP_IOV_NUM here, so I think the number of iovecs per frame should be a parameter. Which.. is basically what I already did in the tap rework series I already sent. 
I think it should be pretty straightforward to rebase the TCP part of this patch on top of that, and drop the tap part.+{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (!tap_send_frames_pasta(c, &iov[i][TCP_IOV_ETH], + TCP_IOV_NUM - TCP_IOV_ETH)) + break; + } + + return i; + +} + /** * tap_send_frames_passt() - Send multiple frames to the passt tap * @c: Execution context @@ -390,6 +414,42 @@ static size_t tap_send_frames_passt(const struct ctx *c, return i; } +/** + * tap_send_iov_passt() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. + * The first entry of the iovec is updated to point to an + * uint32_t storing the frame length. + * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +static size_t tap_send_iov_passt(const struct ctx *c, + struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + uint32_t vnet_len; + int j; + + vnet_len = 0; + for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) + vnet_len += iov[i][j].iov_len; + + vnet_len = htonl(vnet_len); + iov[i][TCP_IOV_VNET].iov_base = &vnet_len; + iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); + + if (!tap_send_frames_passt(c, iov[i], TCP_IOV_NUM)) + break; + } + + return i; + +} + /** * tap_send_frames() - Send out multiple prepared frames * @c: Execution context @@ -418,6 +478,50 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n) return m; } +/** + * tap_send_iov() - Send out multiple prepared frames + * @c: Execution context + * @iov: Array of frames, each frames is divided in an array of iovecs. 
+ * iovec array is:As per Stefano, I think making these constants an enum and moving this description there would be preferable.+ * TCP_IOV_VNET (0) vnet lengthMaybe call this TCP_IOV_TAP, to more easily to generalise to any TAP backend specific headers we might want in future?+ * TCP_IOV_ETH (1) ethernet header + * TCP_IOV_IP (2) IP (v4/v6) header + * TCP_IOV_PAYLOAD (3) IP payload (TCP header + data) + * TCP_IOV_NUM (4) is the number of entries in the iovec array + * TCP_IOV_VNET entry is updated with passt, ignored with pasta. + * @n: Number of frames in @iov + * + * Return: number of frames actually sent + */ +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], + size_t n) +{ + size_t m; + unsigned int i; + + if (!n) + return 0; + + switch (c->mode) { + case MODE_PASST: + m = tap_send_iov_passt(c, iov, n); + break; + case MODE_PASTA: + m = tap_send_iov_pasta(c, iov, n); + break; + default: + ASSERT(0); + } + + if (m < n) + debug("tap: failed to send %zu frames of %zu", n - m, n); + + for (i = 0; i < m; i++) + pcap_iov(&iov[i][TCP_IOV_ETH], TCP_IOV_NUM - TCP_IOV_ETH); + + return m; +} + /** * eth_update_mac() - Update tap L2 header with new Ethernet addresses * @eh: Ethernet headers to update diff --git a/tap.h b/tap.h index 437b9aa2b43f..2d8e7aa1101d 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,20 @@ #ifndef TAP_H #define TAP_H +/* + * TCP frame iovec array: + * TCP_IOV_VNET vnet length + * TCP_IOV_ETH ethernet header + * TCP_IOV_IP IP (v4/v6) header + * TCP_IOV_PAYLOAD IP payload (TCP header + data) + * TCP_IOV_NUM is the number of entries in the iovec array + */ +#define TCP_IOV_VNET 0 +#define TCP_IOV_ETH 1 +#define TCP_IOV_IP 2 +#define TCP_IOV_PAYLOAD 3 +#define TCP_IOV_NUM 4As with using TCP_IOV_NUM etc. 
inside tap.c, defining these in tap.h rather than tcp.h seems odd to me.+ /** * struct tap_hdr - L2 and tap specific headers * @vnet_len: Frame length (for qemu socket transport) @@ -74,6 +88,8 @@ void tap_icmp6_send(const struct ctx *c, const void *in, size_t len); int tap_send(const struct ctx *c, const void *data, size_t len); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n); +size_t tap_send_iov(const struct ctx *c, struct iovec iov[][TCP_IOV_NUM], + size_t n); void eth_update_mac(struct ethhdr *eh, const unsigned char *eth_d, const unsigned char *eth_s); void tap_listen_handler(struct ctx *c, uint32_t events); diff --git a/tcp.c b/tcp.c index d5eedf4d0138..5c8488108ef7 100644 --- a/tcp.c +++ b/tcp.c @@ -318,39 +318,7 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS (USHRT_MAX - sizeof(struct tcphdr))As Stefano points out, the buffer size isn't the only limit on the MSS here, so this isn't quite right. In particular there's the 16-bit length field in the IPv4 header. 
I think making the *buffers* be exactly 64k is a good idea (page aligned on nearly everything), but the MSS has to be clamped a bit smaller.#define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -445,133 +413,78 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * tcp_l2_flags_t - TCP header and data to send option flags + * @th: TCP header + * @opts TCP option flags */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_l2_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +}; +/** + * tcp_l2_payload_t - TCP header and data to send data + * 32 bytes aligned to be able to use AVX2 checksum + * @th: TCP header + * @data: TCP data + */ +struct tcp_l2_payload_t { + struct tcphdr th; /* 20 bytes */ + uint8_t data[MSS]; /* 65516 bytes */Actually 65515 bytes - USHRT_MAX is 65535, not 65536. 
I think it might be an idea to explicitly set this length to 65536 - sizeof(tcphdr), then static_assert() to make sure our calculated MSS values will fit in here.#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp4_l2_buf[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; +/* IPv4 headers */ +static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv4 frames */ +static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_buf_used; -/** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload - */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_buf[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP option flags frames */ +static struct iphdr tcp4_l2_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv4 frames */ +static struct tcp_l2_flags_t tcp4_l2_flags[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; +static 
unsigned int tcp4_l2_flags_buf_used; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; +/* IPv6 headers */ +static struct ipv6hdr tcp6_l2_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_l2_payload_t tcp6_l2_payload[TCP_FRAMES_MEM];We should be able to unify this with the IPv4 payload buffers (and I think we'll want to to allow v4<->v6 NAT). Makes sense to do that as a later followup, though.+ +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; +/* IPv6 headers for TCP option flags frames */ +static struct ipv6hdr tcp6_l2_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv6 frames */ +static struct tcp_l2_flags_t tcp6_l2_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_l2_flags_buf_used; + /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_IOV_NUM]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* 
align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -973,19 +886,8 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) */ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, 
eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); } /** @@ -995,29 +897,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void tcp_sock4_iov_init(const struct ctx *c) { struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + (void)c; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; + + /* headers */ + tcp4_l2_ip[i] = iph; + tcp4_l2_payload[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + };I find C struct literal syntax sufficiently awkward, that I'd probably prefer to just open code setting each field. 
That's only a very weak preference, though.- for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); + tcp4_l2_flags_ip[i] = iph; + tcp4_l2_flags[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); + /* iovecs */ + iov = tcp4_l2_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i]; + + iov = tcp4_l2_flags_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp4_l2_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_flags[i]; + } } /** @@ -1026,29 +941,43 @@ static void tcp_sock4_iov_init(const struct ctx *c) */ static void tcp_sock6_iov_init(const struct ctx *c) { - struct iovec *iov; + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); int i; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + (void)c; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; + + /* headers */ + tcp6_l2_ip[i] = ip6; + tcp6_l2_payload[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = 
tap_iov_base(c, &tcp6_l2_buf[i].taph); + tcp6_l2_flags_ip[i] = ip6; + tcp6_l2_flags[i].th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); + /* iovecs */ + iov = tcp6_l2_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp6_l2_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_payload[i]; + + iov = tcp6_l2_flags_iov[i]; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + iov[TCP_IOV_IP].iov_base = &tcp6_l2_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_l2_flags[i]; + } } /** @@ -1289,10 +1218,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); */ static void tcp_l2_flags_buf_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tap_send_iov(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); tcp6_l2_flags_buf_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); + tap_send_iov(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); tcp4_l2_flags_buf_used = 0; } @@ -1305,12 +1234,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); + m = tap_send_iov(c, tcp6_l2_iov, tcp6_l2_buf_used); for (i = 0; i < m; i++) *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; tcp6_l2_buf_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); + m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used); for (i = 0; i < m; i++) *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; tcp4_l2_buf_used = 0; @@ -1433,7 +1362,7 @@ static size_t tcp_fill_headers6(const struct ctx *c, * tcp_l2_buf_fill_headers() - Fill 
802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffer * @plen: Payload length (including TCP header options) * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -1442,26 +1371,22 @@ static size_t tcp_fill_headers6(const struct ctx *c, */ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, const struct tcp_tap_conn *conn, - void *p, size_t plen, + struct iovec *iov, size_t plen, const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; - - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, + ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, check, seq); - - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct iphdr); } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, + ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, seq); - - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct ipv6hdr); } return tlen; @@ -1595,16 +1520,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; + struct tcp_l2_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; size_t optlen = 0; struct iovec *iov; + struct iovec *dup_iov; struct tcphdr *th; char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1627,18 +1551,15 @@ static int tcp_send_flag(struct ctx 
*c, struct tcp_tap_conn *conn, int flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; + iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used++]; + dup_iov = tcp4_l2_flags_iov[tcp4_l2_flags_buf_used]; } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; + iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used++]; + dup_iov = tcp6_l2_flags_iov[tcp6_l2_flags_buf_used]; } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + th = &payload->th; + data = payload->opts; if (flags & SYN) { int mss; @@ -1653,10 +1574,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) mss = tinfo.tcpi_snd_mss; } else { mss = c->mtu - sizeof(struct tcphdr); - if (CONN_V4(conn)) - mss -= sizeof(struct iphdr); - else - mss -= sizeof(struct ipv6hdr); if (c->low_wmem && !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) @@ -1688,8 +1605,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov, + optlen, NULL, + conn->seq_to_tap); if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) @@ -1705,23 +1623,26 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; + if (flags & DUP_ACK) { + int i; + for (i = 0; i < TCP_IOV_NUM; i++) { + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[i].iov_len = iov[i].iov_len; + } + } + if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; + if (flags & DUP_ACK) 
tcp4_l2_flags_buf_used++; - } - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) + if (tcp4_l2_flags_buf_used > TCP_FRAMES_MEM - 2) tcp_l2_flags_buf_flush(c); } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; + if (flags & DUP_ACK) tcp6_l2_flags_buf_used++; - } - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) + if (tcp6_l2_flags_buf_used > TCP_FRAMES_MEM - 2) tcp_l2_flags_buf_flush(c); } @@ -1887,15 +1808,14 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, unsigned int mss; int ret; + (void)conn; /* unused */ + if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) mss = MSS_DEFAULT; else mss = ret; - if (CONN_V4(conn)) - mss = MIN(MSS4, mss); - else - mss = MIN(MSS6, mss); + mss = MIN(MSS, mss); return MIN(mss, USHRT_MAX); } @@ -2171,27 +2091,30 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, struct iovec *iov; if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? 
&(b - 1)->iph.check : NULL; + struct iovec *iov_prev = tcp4_l2_iov[tcp4_l2_buf_used - 1]; + const uint16_t *check = NULL; + + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) + iov = tcp4_l2_iov[tcp4_l2_buf_used++]; + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, + iov, plen, check, seq); + if (tcp4_l2_buf_used > TCP_FRAMES_MEM - 1) tcp_l2_data_buf_flush(c); } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) + iov = tcp6_l2_iov[tcp6_l2_buf_used++]; + iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, + iov, plen, NULL, seq); + if (tcp6_l2_buf_used > TCP_FRAMES_MEM - 1) tcp_l2_data_buf_flush(c); } } @@ -2247,8 +2170,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { + if (( v4 && tcp4_l2_buf_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_l2_buf_used + fill_bufs > TCP_FRAMES_MEM)) { tcp_l2_data_buf_flush(c); /* Silence Coverity CWE-125 false positive */ @@ -2257,9 +2180,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) - iov->iov_base = 
&tcp4_l2_buf[tcp4_l2_buf_used + i].data; + iov->iov_base = &tcp4_l2_payload[tcp4_l2_buf_used + i].data; else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_base = &tcp6_l2_payload[tcp6_l2_buf_used + i].data; iov->iov_len = mss; } if (iov_rem)-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson