To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Notes: v3: - use (again) a 2d array of iovecs - fix pcap_multiple() call - introduce IP_MAX_MTU to define MSS4 and MSS6 - use VAR[0].data rather than ((TYPE *)0)->data in static_assert() - cleanup tcp_iov_parts comments - use sizeof(VAR) rather than sizeof(TYPE) - fix tcp_l2_buf_fill_headers function comment - don't mix network order and host order in vnet_len - with dup_iov, update only TCP_IOV_PAYLOAD length - remove _l2_ from the buffers names v2: - rebased on top of David's series "Some improvements to the tap send path" - no performance improvement anymore - remove the iovec functions of v1 introduced in tap.c to use new functions from David - don't use an array of array of iovec - fix comments - re-introduce MSS4 and MSS6 - address comments from David and Stefano I have tested only passt, not pasta tcp.c | 524 +++++++++++++++++++++++++++------------------------------ util.h | 3 + 2 files changed, 253 insertions(+), 274 deletions(-) diff --git a/tcp.c b/tcp.c index a1860d10b15f..61533f5fd2f4 100644 --- a/tcp.c +++ b/tcp.c @@ -318,39 +318,8 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS4 ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct iphdr), sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), sizeof(uint32_t)) #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -445,133 +414,107 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * struct tcp_payload_t - TCP header and data to send data + * 32 bytes aligned to be able to use AVX2 checksum + * @th: TCP header + * @data: TCP data */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[65536 - sizeof(struct tcphdr)]; #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp4_l2_buf[TCP_FRAMES_MEM]; - -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_buf_used; /** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload + * struct tcp_flags_t - TCP header and data to send option flags + * @th: TCP header + * @opts TCP option flags */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp6_l2_buf[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; + +static uint32_t tcp4_payload_vnet_len[TCP_FRAMES_MEM]; +/* IPv4 headers */ +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv4 frames */ +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp4_payload[0].data)); + +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_payload_used; + +static uint32_t tcp4_flags_vnet_len[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP option flags frames */ +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv4 frames */ +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp4_flags_used; -static unsigned int tcp6_l2_buf_used; +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; + +static uint32_t tcp6_payload_vnet_len[TCP_FRAMES_MEM]; +/* IPv6 headers */ +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; + +static_assert(MSS6 <= sizeof(tcp6_payload[0].data)); + +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp6_payload_used; + +static uint32_t tcp6_flags_vnet_len[TCP_FRAMES_MEM]; +/* IPv6 headers for TCP option flags frames */ +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv6 frames */ +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_flags_used; /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_TAP TAP header + * @TCP_IOV_ETH ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_TAP = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -967,25 +910,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) } /** - * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses + * tcp_update_l2_buf() - Update ethernet header buffers with addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged */ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); } /** @@ -995,29 +927,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void tcp_sock4_iov_init(const struct ctx *c) { struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } + /* headers */ + tcp4_payload_ip[i] = iph; + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_payload[i].th.ack = 1; + + tcp4_flags_ip[i] = iph; + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_flags[i].th.ack = 1; - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph); + /* iovecs */ + iov = tcp4_l2_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp4_payload_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph); + iov = tcp4_l2_flags_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp4_flags_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; + } } /** @@ -1026,29 +971,43 @@ static void tcp_sock4_iov_init(const struct ctx *c) */ static void tcp_sock6_iov_init(const struct ctx *c) { - struct iovec *iov; + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); int i; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } + /* headers */ + tcp6_payload_ip[i] = ip6; + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_payload[i].th.ack = 1; + + tcp6_flags_ip[i] = ip6; + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_flags[i].th .ack = 1; - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph); + /* iovecs */ + iov = tcp6_l2_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp6_payload_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph); + iov = tcp6_l2_flags_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp6_flags_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + } } /** @@ -1284,36 +1243,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); } while (0) /** - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) + * tcp_flags_flush() - Send out buffers for segments with no data (flags) * @c: Execution context */ -static void tcp_l2_flags_buf_flush(const struct ctx *c) +static void tcp_flags_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used); - tcp6_l2_flags_buf_used = 0; + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp6_flags_used); + tcp6_flags_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used); - tcp4_l2_flags_buf_used = 0; + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp4_flags_used); + tcp4_flags_used = 0; } /** - * tcp_l2_data_buf_flush() - Send out buffers for segments with data + * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ -static void tcp_l2_data_buf_flush(const struct ctx *c) +static void tcp_payload_flush(const struct ctx *c) { unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used); + m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, + tcp6_payload_used); for (i = 0; i < m; i++) - *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; - tcp6_l2_buf_used = 0; + *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + tcp6_payload_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used); + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, + tcp4_payload_used); for (i = 0; i < m; i++) - *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; - tcp4_l2_buf_used = 0; + *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + tcp4_payload_used = 0; } /** @@ -1323,8 +1286,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - tcp_l2_flags_buf_flush(c); - tcp_l2_data_buf_flush(c); + tcp_flags_flush(c); + tcp_payload_flush(c); } /** @@ -1433,35 +1396,31 @@ static size_t tcp_fill_headers6(const struct ctx *c, * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffer * @plen: Payload length (including TCP header options) * @check: Checksum, if already known * @seq: Sequence number for this segment * - * Return: frame length including L2 headers, host order + * Return: IP payload length, host order */ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, const struct tcp_tap_conn *conn, - void *p, size_t plen, + struct iovec *iov, size_t plen, const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; - - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, + ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, check, seq); - - tlen = tap_frame_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct iphdr); } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, + ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, seq); - - tlen = tap_frame_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct ipv6hdr); } return tlen; @@ -1595,16 +1554,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; + struct tcp_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; + uint32_t vnet_len; size_t optlen = 0; - struct iovec *iov; struct tcphdr *th; + struct iovec *iov; + size_t ip_len; char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1627,19 +1586,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr); } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + th = &payload->th; + data = payload->opts; + if (flags & SYN) { int mss; @@ -1688,8 +1645,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, + conn->seq_to_tap); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + + *(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len); if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) @@ -1705,24 +1665,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; - if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; - tcp4_l2_flags_buf_used++; - } + if (flags & DUP_ACK) { + struct iovec *dup_iov; + int i; - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); - } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; - tcp6_l2_flags_buf_used++; - } + if (CONN_V4(conn)) + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); + for (i = 0; i < TCP_NUM_IOVS; i++) + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + } + + if (CONN_V4(conn)) { + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } else { + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); } return 0; @@ -2169,30 +2132,43 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, { uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; + size_t ip_len; if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; + const uint16_t *check = NULL; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_data_buf_flush(c); + tcp4_seq_update[tcp4_payload_used].seq = seq_update; + tcp4_seq_update[tcp4_payload_used].len = plen; + + iov = tcp4_l2_iov[tcp4_payload_used++]; + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check, + seq); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + *(uint32_t *)iov[TCP_IOV_TAP].iov_base = + htonl(sizeof(struct ethhdr) + + sizeof(struct iphdr) + + ip_len); + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; + tcp6_seq_update[tcp6_payload_used].seq = seq_update; + tcp6_seq_update[tcp6_payload_used].len = plen; - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_data_buf_flush(c); + iov = tcp6_l2_iov[tcp6_payload_used++]; + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + *(uint32_t *)iov[TCP_IOV_TAP].iov_base = + htonl(sizeof(struct ethhdr) + + sizeof(struct ipv6hdr) + + ip_len); + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } } @@ -2247,19 +2223,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { - tcp_l2_data_buf_flush(c); + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + tcp_payload_flush(c); /* Silence Coverity CWE-125 false positive */ - tcp4_l2_buf_used = tcp6_l2_buf_used = 0; + tcp4_payload_used = tcp6_payload_used = 0; } for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) - iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; iov->iov_len = mss; } if (iov_rem) @@ -2304,7 +2280,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) plen = mss; seq = conn->seq_to_tap; for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; if (i == send_bufs - 1) plen = last_len; diff --git a/util.h b/util.h index 48f35604df92..0069df34a5d2 100644 --- a/util.h +++ b/util.h @@ -31,6 +31,9 @@ #ifndef ETH_MIN_MTU #define ETH_MIN_MTU 68 #endif +#ifndef IP_MAX_MTU +#define IP_MAX_MTU USHRT_MAX +#endif #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) -- 2.42.0
On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote:To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>--- Notes: v3: - use (again) a 2d array of iovecs - fix pcap_multiple() call - introduce IP_MAX_MTU to define MSS4 and MSS6 - use VAR[0].data rather than ((TYPE *)0)->data in static_assert() - cleanup tcp_iov_parts comments - use sizeof(VAR) rather than sizeof(TYPE) - fix tcp_l2_buf_fill_headers function comment - don't mix network order and host order in vnet_len - with dup_iov, update only TCP_IOV_PAYLOAD length - remove _l2_ from the buffers names v2: - rebased on top of David's series "Some improvements to the tap send path" - no performance improvement anymore - remove the iovec functions of v1 introduced in tap.c to use new functions from David - don't use an array of array of iovec - fix comments - re-introduce MSS4 and MSS6 - address comments from David and Stefano I have tested only passt, not pasta tcp.c | 524 +++++++++++++++++++++++++++------------------------------ util.h | 3 + 2 files changed, 253 insertions(+), 274 deletions(-) diff --git a/tcp.c b/tcp.c index a1860d10b15f..61533f5fd2f4 100644 --- a/tcp.c +++ b/tcp.c @@ -318,39 +318,8 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 - -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) +#define MSS4 ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct iphdr), sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), sizeof(uint32_t)) #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -445,133 +414,107 @@ struct tcp_buf_seq_update { }; /* Static buffers */ - /** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload + * struct tcp_payload_t - TCP header and data to send data + * 32 bytes aligned to be able to use AVX2 checksum + * @th: TCP header + * @data: TCP data */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[65536 - sizeof(struct tcphdr)]; #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp4_l2_buf[TCP_FRAMES_MEM]; - -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_buf_used; /** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload + * struct tcp_flags_t - TCP header and data to send option flags + * @th: TCP header + * @opts TCP option flags */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; #ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) +} __attribute__ ((packed, aligned(32))); #else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif -tcp6_l2_buf[TCP_FRAMES_MEM]; -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; + +static uint32_t tcp4_payload_vnet_len[TCP_FRAMES_MEM]; +/* IPv4 headers */ +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv4 frames */ +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp4_payload[0].data)); + +static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp4_payload_used; + +static uint32_t tcp4_flags_vnet_len[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP option flags frames */ +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv4 frames */ +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp4_flags_used; -static unsigned int tcp6_l2_buf_used; +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; + +static uint32_t tcp6_payload_vnet_len[TCP_FRAMES_MEM]; +/* IPv6 headers */ +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; + +static_assert(MSS6 <= sizeof(tcp6_payload[0].data)); + +static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; +static unsigned int tcp6_payload_used; + +static uint32_t tcp6_flags_vnet_len[TCP_FRAMES_MEM]; +/* IPv6 headers for TCP option flags frames */ +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; +/* TCP headers and option flags for IPv6 frames */ +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_flags_used; /* recvmsg()/sendmsg() data for tap */ static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_TAP TAP header + * @TCP_IOV_ETH ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_TAP = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -967,25 +910,14 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) } /** - * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses + * tcp_update_l2_buf() - Update ethernet header buffers with addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged */ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - eth_update_mac(&b4->taph.eh, eth_d, eth_s); - eth_update_mac(&b6->taph.eh, eth_d, eth_s); - eth_update_mac(&b4f->taph.eh, eth_d, eth_s); - eth_update_mac(&b6f->taph.eh, eth_d, eth_s); - } + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); } /** @@ -995,29 +927,42 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) static void tcp_sock4_iov_init(const struct ctx *c) { struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; int i; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } + /* headers */ + tcp4_payload_ip[i] = iph; + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_payload[i].th.ack = 1; + + tcp4_flags_ip[i] = iph; + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_flags[i].th.ack = 1; - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph); + /* iovecs */ + iov = tcp4_l2_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp4_payload_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp4_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph); + iov = tcp4_l2_flags_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp4_flags_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp4_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; + } } /** @@ -1026,29 +971,43 @@ static void tcp_sock4_iov_init(const struct ctx *c) */ static void tcp_sock6_iov_init(const struct ctx *c) { - struct iovec *iov; + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); int i; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct iovec *iov; - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } + /* headers */ + tcp6_payload_ip[i] = ip6; + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_payload[i].th.ack = 1; + + tcp6_flags_ip[i] = ip6; + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_flags[i].th .ack = 1; - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph); + /* iovecs */ + iov = tcp6_l2_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp6_payload_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp6_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph); + iov = tcp6_l2_flags_iov[i]; + iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i]; + iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? + sizeof(tcp6_flags_vnet_len[i]) : 0; + iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; + iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); + iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i]; + iov[TCP_IOV_IP].iov_len = sizeof(tcp6_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + } } /** @@ -1284,36 +1243,40 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); } while (0) /** - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) + * tcp_flags_flush() - Send out buffers for segments with no data (flags) * @c: Execution context */ -static void tcp_l2_flags_buf_flush(const struct ctx *c) +static void tcp_flags_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used); - tcp6_l2_flags_buf_used = 0; + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp6_flags_used); + tcp6_flags_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used); - tcp4_l2_flags_buf_used = 0; + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp4_flags_used); + tcp4_flags_used = 0; } /** - * tcp_l2_data_buf_flush() - Send out buffers for segments with data + * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ -static void tcp_l2_data_buf_flush(const struct ctx *c) +static void tcp_payload_flush(const struct ctx *c) { unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used); + m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, + tcp6_payload_used); for (i = 0; i < m; i++) - *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; - tcp6_l2_buf_used = 0; + *tcp6_seq_update[i].seq += tcp6_seq_update[i].len; + tcp6_payload_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used); + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, + tcp4_payload_used); for (i = 0; i < m; i++) - *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; - tcp4_l2_buf_used = 0; + *tcp4_seq_update[i].seq += tcp4_seq_update[i].len; + tcp4_payload_used = 0; } /** @@ -1323,8 +1286,8 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - tcp_l2_flags_buf_flush(c); - tcp_l2_data_buf_flush(c); + tcp_flags_flush(c); + tcp_payload_flush(c); } /** @@ -1433,35 +1396,31 @@ static size_t tcp_fill_headers6(const struct ctx *c, * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers * @c: Execution context * @conn: Connection pointer - * @p: Pointer to any type of TCP pre-cooked buffer + * @iov: Pointer to an array of iovec of TCP pre-cooked buffer * @plen: Payload length (including TCP header options) * @check: Checksum, if already known * @seq: Sequence number for this segment * - * Return: frame length including L2 headers, host order + * Return: IP payload length, host order */ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, const struct tcp_tap_conn *conn, - void *p, size_t plen, + struct iovec *iov, size_t plen, const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; if (a4) { - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; - - ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, + ip_len = tcp_fill_headers4(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, check, seq); - - tlen = tap_frame_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct iphdr); } else { - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - - ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, + ip_len = tcp_fill_headers6(c, conn, iov[TCP_IOV_IP].iov_base, + iov[TCP_IOV_PAYLOAD].iov_base, plen, seq); - - tlen = tap_frame_len(c, &b->taph, ip_len); + tlen = ip_len - sizeof(struct ipv6hdr); } return tlen; @@ -1595,16 +1554,16 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; - struct tcp4_l2_flags_buf_t *b4 = NULL; - struct tcp6_l2_flags_buf_t *b6 = NULL; + struct tcp_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; + uint32_t vnet_len; size_t optlen = 0; - struct iovec *iov; struct tcphdr *th; + struct iovec *iov; + size_t ip_len; char *data; - void *p; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1627,19 +1586,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - th = &b4->th; - - /* gcc 11.2 would complain on data = (char *)(th + 1); */ - data = b4->opts; + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr); } else { - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - th = &b6->th; - data = b6->opts; + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); } + payload = iov[TCP_IOV_PAYLOAD].iov_base; + th = &payload->th; + data = payload->opts; + if (flags & SYN) { int mss; @@ -1688,8 +1645,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, - NULL, conn->seq_to_tap); + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, + conn->seq_to_tap); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + + *(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len); if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) @@ -1705,24 +1665,27 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; - if (CONN_V4(conn)) { - if (flags & DUP_ACK) { - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; - tcp4_l2_flags_buf_used++; - } + if (flags & DUP_ACK) { + struct iovec *dup_iov; + int i; - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); - } else { - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; - tcp6_l2_flags_buf_used++; - } + if (CONN_V4(conn)) + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); + for (i = 0; i < TCP_NUM_IOVS; i++) + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + } + + if (CONN_V4(conn)) { + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } else { + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); } return 0; @@ -2169,30 +2132,43 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, { uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; + size_t ip_len; if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; + const uint16_t *check = NULL; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_data_buf_flush(c); + tcp4_seq_update[tcp4_payload_used].seq = seq_update; + tcp4_seq_update[tcp4_payload_used].len = plen; + + iov = tcp4_l2_iov[tcp4_payload_used++]; + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check, + seq); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + *(uint32_t *)iov[TCP_IOV_TAP].iov_base = + htonl(sizeof(struct ethhdr) + + sizeof(struct iphdr) + + ip_len); + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; + tcp6_seq_update[tcp6_payload_used].seq = seq_update; + tcp6_seq_update[tcp6_payload_used].len = plen; - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_data_buf_flush(c); + iov = tcp6_l2_iov[tcp6_payload_used++]; + ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq); + iov[TCP_IOV_PAYLOAD].iov_len = ip_len; + *(uint32_t *)iov[TCP_IOV_TAP].iov_base = + htonl(sizeof(struct ethhdr) + + sizeof(struct ipv6hdr) + + ip_len); + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); } } @@ -2247,19 +2223,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { - tcp_l2_data_buf_flush(c); + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + tcp_payload_flush(c); /* Silence Coverity CWE-125 false positive */ - tcp4_l2_buf_used = tcp6_l2_buf_used = 0; + tcp4_payload_used = tcp6_payload_used = 0; } for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) - iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; iov->iov_len = mss; } if (iov_rem) @@ -2304,7 +2280,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) plen = mss; seq = conn->seq_to_tap; for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; if (i == send_bufs - 1) plen = last_len; diff --git a/util.h b/util.h index 48f35604df92..0069df34a5d2 100644 --- a/util.h +++ b/util.h @@ -31,6 +31,9 @@ #ifndef ETH_MIN_MTU #define ETH_MIN_MTU 68 #endif +#ifndef IP_MAX_MTU +#define IP_MAX_MTU USHRT_MAX +#endif #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y))-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote:To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>[snip]+static_assert(MSS4 <= sizeof(tcp4_payload[0].data));This generates a clang-tidy warning, because apparently the C11 version of static_assert() requires a message - making it optional is a C2x extension. Laurent, I know you're having trouble getting the full testsuite to run (but some ideas on that later today, I hope). But could you please add a "make cppcheck" and "make clang-tidy" to your pre-post routine. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On 3/21/24 02:26, David Gibson wrote:On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote:I'll do. Thank you to have checked that. Are there any other commands to run before to send? Something like scripts/chekpatch.pl we have in QEMU? I have always the same problem with the test suite, even after a "make realclean" in test (see attachment) How to debug? Thanks, LaurentTo be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>[snip]+static_assert(MSS4 <= sizeof(tcp4_payload[0].data));This generates a clang-tidy warning, because apparently the C11 version of static_assert() requires a message - making it optional is a C2x extension. Laurent, I know you're having trouble getting the full testsuite to run (but some ideas on that later today, I hope). But could you please add a "make cppcheck" and "make clang-tidy" to your pre-post routine.
On Thu, Mar 21, 2024 at 09:26:58AM +0100, Laurent Vivier wrote:On 3/21/24 02:26, David Gibson wrote:Huh. Guest kernel can't find root. That's not a fault I was expecting (and alas, won't be helped by the patch I have in the works). What's really weird is I don't see any signs of loading the initrd before that failure, and with the mbuto images we're using we should never *leave* the initrd. Which kind of makes sense - if it's not seeing the initrd, then there is indeed no root to mount. Can you gran the full qemu command line it's trying to use? -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonOn Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote:I'll do. Thank you to have checked that. Are there any other commands to run before to send? Something like scripts/chekpatch.pl we have in QEMU? I have always the same problem with the test suite, even after a "make realclean" in test (see attachment) How to debug?To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>[snip]+static_assert(MSS4 <= sizeof(tcp4_payload[0].data));This generates a clang-tidy warning, because apparently the C11 version of static_assert() requires a message - making it optional is a C2x extension. Laurent, I know you're having trouble getting the full testsuite to run (but some ideas on that later today, I hope). But could you please add a "make cppcheck" and "make clang-tidy" to your pre-post routine.
On 3/21/24 11:55, David Gibson wrote:On Thu, Mar 21, 2024 at 09:26:58AM +0100, Laurent Vivier wrote:My file mbuto seems to be empty: $ cd test/ $ make ./mbuto/mbuto -p ./passt.mem.mbuto -c lz4 -f mbuto.mem.img Applying profile from file ./passt.mem.mbuto depmod: WARNING: could not open modules.builtin.modinfo at /tmp/tmp.0Xr88SNogx/lib/modules/6.7.9-200.fc39.x86_64: No such file or directory Size: bin 4.9M lib 15M kmod 218k total 20M compressed 4.1k KERNEL=/boot/vmlinuz-6.7.9-200.fc39.x86_64 INITRD=/home/lvivier/Projects/passt/test/mbuto.mem.img initramfs mounted at: /tmp/tmp.0Xr88SNogx $ ls -l /home/lvivier/Projects/passt/test/mbuto.mem.img -rw-r--r--. 1 lvivier lvivier 50 Mar 21 13:17 /home/lvivier/Projects/passt/test/mbuto.mem.img $ gzip -d - < /home/lvivier/Projects/passt/test/mbuto.mem.img|hexdump -C 00000000 30 37 30 37 30 31 30 30 30 30 30 30 30 30 30 30 |0707010000000000| 00000010 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 |0000000000000000| 00000020 30 30 30 30 30 30 30 30 30 30 30 30 30 31 30 30 |0000000000000100| 00000030 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 |0000000000000000| * 00000060 30 30 30 30 30 42 30 30 30 30 30 30 30 30 54 52 |00000B00000000TR| 00000070 41 49 4c 45 52 21 21 21 00 00 00 00 00 00 00 00 |AILER!!!........| 00000080 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 00000200 Thanks, LaurentOn 3/21/24 02:26, David Gibson wrote:Huh. Guest kernel can't find root. That's not a fault I was expecting (and alas, won't be helped by the patch I have in the works). What's really weird is I don't see any signs of loading the initrd before that failure, and with the mbuto images we're using we should never *leave* the initrd. Which kind of makes sense - if it's not seeing the initrd, then there is indeed no root to mount. Can you gran the full qemu command line it's trying to use?On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote:I'll do. Thank you to have checked that. Are there any other commands to run before to send? Something like scripts/chekpatch.pl we have in QEMU? I have always the same problem with the test suite, even after a "make realclean" in test (see attachment) How to debug?To be able to provide pointers to TCP headers and IP headers without worrying about alignment in the structure, split the structure into several arrays and point to each part of the frame using an iovec array. Using iovec also allows us to simply ignore the first entry when the vnet length header is not needed. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>[snip]+static_assert(MSS4 <= sizeof(tcp4_payload[0].data));This generates a clang-tidy warning, because apparently the C11 version of static_assert() requires a message - making it optional is a C2x extension. Laurent, I know you're having trouble getting the full testsuite to run (but some ideas on that later today, I hope). But could you please add a "make cppcheck" and "make clang-tidy" to your pre-post routine.
On Thu, Mar 21, 2024 at 01:21:03PM +0100, Laurent Vivier wrote:On 3/21/24 11:55, David Gibson wrote:Welp, that would explain it.On Thu, Mar 21, 2024 at 09:26:58AM +0100, Laurent Vivier wrote:My file mbuto seems to be empty:On 3/21/24 02:26, David Gibson wrote:Huh. Guest kernel can't find root. That's not a fault I was expecting (and alas, won't be helped by the patch I have in the works). What's really weird is I don't see any signs of loading the initrd before that failure, and with the mbuto images we're using we should never *leave* the initrd. Which kind of makes sense - if it's not seeing the initrd, then there is indeed no root to mount. Can you gran the full qemu command line it's trying to use?On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote: > To be able to provide pointers to TCP headers and IP headers without > worrying about alignment in the structure, split the structure into > several arrays and point to each part of the frame using an iovec array. > > Using iovec also allows us to simply ignore the first entry when the > vnet length header is not needed. > > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> [snip] > +static_assert(MSS4 <= sizeof(tcp4_payload[0].data)); This generates a clang-tidy warning, because apparently the C11 version of static_assert() requires a message - making it optional is a C2x extension. Laurent, I know you're having trouble getting the full testsuite to run (but some ideas on that later today, I hope). But could you please add a "make cppcheck" and "make clang-tidy" to your pre-post routine.I'll do. Thank you to have checked that. Are there any other commands to run before to send? Something like scripts/chekpatch.pl we have in QEMU? I have always the same problem with the test suite, even after a "make realclean" in test (see attachment) How to debug?$ cd test/ $ make ./mbuto/mbuto -p ./passt.mem.mbuto -c lz4 -f mbuto.mem.img Applying profile from file ./passt.mem.mbuto depmod: WARNING: could not open modules.builtin.modinfo at /tmp/tmp.0Xr88SNogx/lib/modules/6.7.9-200.fc39.x86_64: No such file or directorySo now we have two mysteries. 1) Why is depmod failing here, and 2) why isn't depmod failing causing mbuto to fail, rather than generating nonsense output.Size: bin 4.9M lib 15M kmod 218k total 20M compressed 4.1k KERNEL=/boot/vmlinuz-6.7.9-200.fc39.x86_64 INITRD=/home/lvivier/Projects/passt/test/mbuto.mem.img initramfs mounted at: /tmp/tmp.0Xr88SNogx $ ls -l /home/lvivier/Projects/passt/test/mbuto.mem.img -rw-r--r--. 1 lvivier lvivier 50 Mar 21 13:17 /home/lvivier/Projects/passt/test/mbuto.mem.img $ gzip -d - < /home/lvivier/Projects/passt/test/mbuto.mem.img|hexdump -C 00000000 30 37 30 37 30 31 30 30 30 30 30 30 30 30 30 30 |0707010000000000| 00000010 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 |0000000000000000| 00000020 30 30 30 30 30 30 30 30 30 30 30 30 30 31 30 30 |0000000000000100| 00000030 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 |0000000000000000| * 00000060 30 30 30 30 30 42 30 30 30 30 30 30 30 30 54 52 |00000B00000000TR| 00000070 41 49 4c 45 52 21 21 21 00 00 00 00 00 00 00 00 |AILER!!!........| 00000080 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 00000200 Thanks, Laurent-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Thu, Mar 21, 2024 at 11:47:45PM +1100, David Gibson wrote:On Thu, Mar 21, 2024 at 01:21:03PM +0100, Laurent Vivier wrote:Can you get the output of running mbuto with sh -x? That might shed some light.On 3/21/24 11:55, David Gibson wrote:Welp, that would explain it.On Thu, Mar 21, 2024 at 09:26:58AM +0100, Laurent Vivier wrote:My file mbuto seems to be empty:On 3/21/24 02:26, David Gibson wrote: > On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote: > > To be able to provide pointers to TCP headers and IP headers without > > worrying about alignment in the structure, split the structure into > > several arrays and point to each part of the frame using an iovec array. > > > > Using iovec also allows us to simply ignore the first entry when the > > vnet length header is not needed. > > > > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> > > [snip] > > +static_assert(MSS4 <= sizeof(tcp4_payload[0].data)); > > This generates a clang-tidy warning, because apparently the C11 > version of static_assert() requires a message - making it optional is > a C2x extension. > > Laurent, I know you're having trouble getting the full testsuite to > run (but some ideas on that later today, I hope). But could you > please add a "make cppcheck" and "make clang-tidy" to your pre-post > routine. > I'll do. Thank you to have checked that. Are there any other commands to run before to send? Something like scripts/chekpatch.pl we have in QEMU? I have always the same problem with the test suite, even after a "make realclean" in test (see attachment) How to debug?Huh. Guest kernel can't find root. That's not a fault I was expecting (and alas, won't be helped by the patch I have in the works). What's really weird is I don't see any signs of loading the initrd before that failure, and with the mbuto images we're using we should never *leave* the initrd. Which kind of makes sense - if it's not seeing the initrd, then there is indeed no root to mount. Can you gran the full qemu command line it's trying to use?$ cd test/ $ make ./mbuto/mbuto -p ./passt.mem.mbuto -c lz4 -f mbuto.mem.img Applying profile from file ./passt.mem.mbuto depmod: WARNING: could not open modules.builtin.modinfo at /tmp/tmp.0Xr88SNogx/lib/modules/6.7.9-200.fc39.x86_64: No such file or directorySo now we have two mysteries. 1) Why is depmod failing here, and 2) why isn't depmod failing causing mbuto to fail, rather than generating nonsense output.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonSize: bin 4.9M lib 15M kmod 218k total 20M compressed 4.1k KERNEL=/boot/vmlinuz-6.7.9-200.fc39.x86_64 INITRD=/home/lvivier/Projects/passt/test/mbuto.mem.img initramfs mounted at: /tmp/tmp.0Xr88SNogx $ ls -l /home/lvivier/Projects/passt/test/mbuto.mem.img -rw-r--r--. 1 lvivier lvivier 50 Mar 21 13:17 /home/lvivier/Projects/passt/test/mbuto.mem.img $ gzip -d - < /home/lvivier/Projects/passt/test/mbuto.mem.img|hexdump -C 00000000 30 37 30 37 30 31 30 30 30 30 30 30 30 30 30 30 |0707010000000000| 00000010 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 |0000000000000000| 00000020 30 30 30 30 30 30 30 30 30 30 30 30 30 31 30 30 |0000000000000100| 00000030 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 |0000000000000000| * 00000060 30 30 30 30 30 42 30 30 30 30 30 30 30 30 54 52 |00000B00000000TR| 00000070 41 49 4c 45 52 21 21 21 00 00 00 00 00 00 00 00 |AILER!!!........| 00000080 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 00000200 Thanks, Laurent
On 3/21/24 13:51, David Gibson wrote:On Thu, Mar 21, 2024 at 11:47:45PM +1100, David Gibson wrote:Really strange: + /usr/bin/cpio --create -H newc --quiet + /usr/bin/gzip + '[' -n /usr/bin/archivemount ']' + /usr/bin/archivemount /home/lvivier/Projects/passt/test/mbuto.mem.img /tmp/tmp.AXKp3fwxoA + info 'Mounted CPIO archive /home/lvivier/Projects/passt/test/mbuto.mem.img at /tmp/tmp.AXKp3fwxoA' and then: $ find /tmp/tmp.AXKp3fwxoA /tmp/tmp.AXKp3fwxoA /tmp/tmp.AXKp3fwxoA/lib /tmp/tmp.AXKp3fwxoA/lib/modules /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64 /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/kernel /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/kernel/drivers /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/kernel/drivers/net /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/kernel/drivers/net/dummy.ko.xz /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.order /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.builtin /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.dep /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.dep.bin /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.alias /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.alias.bin /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.softdep /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.symbols /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.symbols.bin /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.builtin.bin /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.builtin.alias.bin /tmp/tmp.AXKp3fwxoA/lib/modules/6.7.9-200.fc39.x86_64/modules.devname /tmp/tmp.AXKp3fwxoA/proc /tmp/tmp.AXKp3fwxoA/sys /tmp/tmp.AXKp3fwxoA/tmp /tmp/tmp.AXKp3fwxoA/sbin /tmp/tmp.AXKp3fwxoA/usr /tmp/tmp.AXKp3fwxoA/usr/bin /tmp/tmp.AXKp3fwxoA/usr/bin/bash /tmp/tmp.AXKp3fwxoA/usr/bin/chmod /tmp/tmp.AXKp3fwxoA/usr/bin/mount /tmp/tmp.AXKp3fwxoA/usr/bin/mkdir /tmp/tmp.AXKp3fwxoA/usr/bin/ln /tmp/tmp.AXKp3fwxoA/usr/bin/cat /tmp/tmp.AXKp3fwxoA/usr/bin/grep /tmp/tmp.AXKp3fwxoA/usr/bin/mknod /tmp/tmp.AXKp3fwxoA/usr/bin/sed /tmp/tmp.AXKp3fwxoA/usr/bin/chown /tmp/tmp.AXKp3fwxoA/usr/bin/sleep /tmp/tmp.AXKp3fwxoA/usr/bin/bc /tmp/tmp.AXKp3fwxoA/usr/bin/ls /tmp/tmp.AXKp3fwxoA/usr/bin/ps /tmp/tmp.AXKp3fwxoA/usr/bin/unshare /tmp/tmp.AXKp3fwxoA/usr/bin/cp /tmp/tmp.AXKp3fwxoA/usr/bin/kill /tmp/tmp.AXKp3fwxoA/usr/bin/diff /tmp/tmp.AXKp3fwxoA/usr/bin/head /tmp/tmp.AXKp3fwxoA/usr/bin/tail /tmp/tmp.AXKp3fwxoA/usr/bin/sort /tmp/tmp.AXKp3fwxoA/usr/bin/tr /tmp/tmp.AXKp3fwxoA/usr/bin/tee /tmp/tmp.AXKp3fwxoA/usr/bin/cut /tmp/tmp.AXKp3fwxoA/usr/bin/nm /tmp/tmp.AXKp3fwxoA/usr/bin/which /tmp/tmp.AXKp3fwxoA/usr/lib64 /tmp/tmp.AXKp3fwxoA/usr/lib64/libfakeroot /tmp/tmp.AXKp3fwxoA/usr/lib64/libfakeroot/libfakeroot-tcp.so /tmp/tmp.AXKp3fwxoA/usr/lib64/libfakeroot/libfakeroot-0.so /tmp/tmp.AXKp3fwxoA/usr/sbin /tmp/tmp.AXKp3fwxoA/usr/sbin/ip /tmp/tmp.AXKp3fwxoA/usr/sbin/insmod /tmp/tmp.AXKp3fwxoA/usr/sbin/modprobe /tmp/tmp.AXKp3fwxoA/usr/sbin/chroot /tmp/tmp.AXKp3fwxoA/bin /tmp/tmp.AXKp3fwxoA/bin/sh /tmp/tmp.AXKp3fwxoA/bin/passt.avx2 /tmp/tmp.AXKp3fwxoA/etc /tmp/tmp.AXKp3fwxoA/etc/ld.so.conf /tmp/tmp.AXKp3fwxoA/etc/ld.so.cache /tmp/tmp.AXKp3fwxoA/lib64 /tmp/tmp.AXKp3fwxoA/lib64/libc.so.6 /tmp/tmp.AXKp3fwxoA/lib64/libnss_dns.so.2 /tmp/tmp.AXKp3fwxoA/lib64/libresolv.so.2 /tmp/tmp.AXKp3fwxoA/lib64/ld-linux-x86-64.so.2 /tmp/tmp.AXKp3fwxoA/lib64/libnss_files.so.2 /tmp/tmp.AXKp3fwxoA/lib64/libtinfo.so.6 /tmp/tmp.AXKp3fwxoA/lib64/libbpf.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libelf.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libz.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libzstd.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libmnl.so.0 /tmp/tmp.AXKp3fwxoA/lib64/libcap.so.2 /tmp/tmp.AXKp3fwxoA/lib64/libmount.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libblkid.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libselinux.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libpcre2-8.so.0 /tmp/tmp.AXKp3fwxoA/lib64/liblzma.so.5 /tmp/tmp.AXKp3fwxoA/lib64/libcrypto.so.3 /tmp/tmp.AXKp3fwxoA/lib64/libgcc_s.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libacl.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libattr.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libreadline.so.8 /tmp/tmp.AXKp3fwxoA/lib64/libproc2.so.0 /tmp/tmp.AXKp3fwxoA/lib64/libsystemd.so.0 /tmp/tmp.AXKp3fwxoA/lib64/liblz4.so.1 /tmp/tmp.AXKp3fwxoA/lib64/libbfd-2.40-14.fc39.so /tmp/tmp.AXKp3fwxoA/lib64/libsframe.so.0 /tmp/tmp.AXKp3fwxoA/dev /tmp/tmp.AXKp3fwxoA/dev/console /tmp/tmp.AXKp3fwxoA/dev/kmsg /tmp/tmp.AXKp3fwxoA/dev/null /tmp/tmp.AXKp3fwxoA/dev/ptmx /tmp/tmp.AXKp3fwxoA/dev/random /tmp/tmp.AXKp3fwxoA/dev/urandom /tmp/tmp.AXKp3fwxoA/dev/zero /tmp/tmp.AXKp3fwxoA/init but $ gzip -d -c < /home/lvivier/Projects/passt/test/mbuto.mem.img | cpio -t 1 block Any idea? Thanks, LaurentOn Thu, Mar 21, 2024 at 01:21:03PM +0100, Laurent Vivier wrote:Can you get the output of running mbuto with sh -x? That might shed some light.On 3/21/24 11:55, David Gibson wrote:Welp, that would explain it.On Thu, Mar 21, 2024 at 09:26:58AM +0100, Laurent Vivier wrote: > On 3/21/24 02:26, David Gibson wrote: >> On Wed, Mar 20, 2024 at 05:31:46PM +0100, Laurent Vivier wrote: >>> To be able to provide pointers to TCP headers and IP headers without >>> worrying about alignment in the structure, split the structure into >>> several arrays and point to each part of the frame using an iovec array. >>> >>> Using iovec also allows us to simply ignore the first entry when the >>> vnet length header is not needed. >>> >>> Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> >> >> [snip] >>> +static_assert(MSS4 <= sizeof(tcp4_payload[0].data)); >> >> This generates a clang-tidy warning, because apparently the C11 >> version of static_assert() requires a message - making it optional is >> a C2x extension. >> >> Laurent, I know you're having trouble getting the full testsuite to >> run (but some ideas on that later today, I hope). But could you >> please add a "make cppcheck" and "make clang-tidy" to your pre-post >> routine. >> > > I'll do. > Thank you to have checked that. > Are there any other commands to run before to send? > Something like scripts/chekpatch.pl we have in QEMU? > > I have always the same problem with the test suite, even after a "make > realclean" in test (see attachment) > How to debug? Huh. Guest kernel can't find root. That's not a fault I was expecting (and alas, won't be helped by the patch I have in the works). What's really weird is I don't see any signs of loading the initrd before that failure, and with the mbuto images we're using we should never *leave* the initrd. Which kind of makes sense - if it's not seeing the initrd, then there is indeed no root to mount. Can you gran the full qemu command line it's trying to use?My file mbuto seems to be empty:$ cd test/ $ make ./mbuto/mbuto -p ./passt.mem.mbuto -c lz4 -f mbuto.mem.img Applying profile from file ./passt.mem.mbuto depmod: WARNING: could not open modules.builtin.modinfo at /tmp/tmp.0Xr88SNogx/lib/modules/6.7.9-200.fc39.x86_64: No such file or directorySo now we have two mysteries. 1) Why is depmod failing here, and 2) why isn't depmod failing causing mbuto to fail, rather than generating nonsense output.