Extract buffers management code from tcp.c and move it to tcp_buf.c.
tcp.c keeps all the generic code and will also be used by the vhost-user
functions.

Also compare mode to MODE_PASTA, as we will manage vhost-user mode
(MODE_VU) like MODE_PASST.

v7:
- update comments (tap_flush_pools(), func and line)
- drop "-L4" in tcp_buf.c header
- fix *optlen initialization
- fix tcp4_flags_used/tcp6_flags_used value on error
- fix seq_to_tap value

v6:
- reorder declarations, align summing, add curly brackets
- rename tap_handler_all() to tap_handler()
- rename packet_add_all_do() to tap_add_packet() and remove func and line
- rename pool_flush_all() to tap_flush_pools()
- rename tcp_fill_flag_header() to tcp_prepare_flags()
- set optlen to 0 in tcp_prepare_flags()

v5:
- remove: [PATCH v4 01/10] tcp: inline tcp_l2_buf_fill_headers()
- merge [PATCH v4 09/10] tcp: remove tap_hdr parameter into
  tcp: extract buffer management from tcp_send_flag()
- update comments

v4:
- remove "tcp: extract buffer management from tcp_conn_tap_mss()" as the
  MSS size can be the same between socket and vhost-user
- rename tcp_send_flag() and tcp_data_from_sock() to tcp_buf_send_flag()
  and tcp_buf_data_from_sock()

v3:
- add 3 new patches:
    tap: use in->buf_size rather than sizeof(pkt_buf)
    tcp: remove tap_hdr parameter
    iov: remove iov_copy()

v2:
- compare to MODE_PASTA in conf_open_files() too
- move taph out of udp_update_hdr4()/udp_update_hdr6()

Laurent Vivier (8):
  tcp: extract buffer management from tcp_send_flag()
  tcp: move buffers management functions to their own file
  tap: refactor packets handling functions
  udp: refactor UDP header update functions
  udp: rename udp_sock_handler() to udp_buf_sock_handler()
  vhost-user: compare mode MODE_PASTA and not MODE_PASST
  iov: remove iov_copy()
  tap: use in->buf_size rather than sizeof(pkt_buf)

 Makefile       |   5 +-
 conf.c         |  14 +-
 iov.c          |  39 ----
 iov.h          |   3 -
 isolation.c    |  10 +-
 passt.c        |   4 +-
 tap.c          | 132 ++++++-----
 tap.h          |   3 +
 tcp.c          | 580 ++++++--------------------------------------------
 tcp_buf.c      | 519 +++++++++++++++++++++++++++++++++++++++++++
 tcp_buf.h      |  16 ++
 tcp_internal.h |  96 ++++++++
 udp.c          |  68 +++---
 udp.h          |   2 +-
 14 files changed, 803 insertions(+), 688 deletions(-)
 create mode 100644 tcp_buf.c
 create mode 100644 tcp_buf.h
 create mode 100644 tcp_internal.h

-- 
2.45.2
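To make the mode comparison point concrete: checks that previously singled
out MODE_PASST are inverted to single out MODE_PASTA, so that a future
MODE_VU takes the same code path as MODE_PASST by default. A minimal sketch
of the idea, not taken from the series (handle_pasta() and handle_passt()
are made-up names used only for illustration):

	if (c->mode == MODE_PASTA)
		handle_pasta(c);	/* pasta-only path */
	else
		handle_passt(c);	/* taken by MODE_PASST today,
					 * and by MODE_VU later */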
This commit isolates the internal data structure management used for storing data (e.g., tcp4_l2_flags_iov[], tcp6_l2_flags_iov[], tcp4_flags_ip[], tcp4_flags[], ...) from the tcp_send_flag() function. The extracted functionality is relocated to a new function named tcp_fill_flag_header(). tcp_fill_flag_header() is now a generic function that accepts parameters such as struct tcphdr and a data pointer. tcp_send_flag() utilizes this parameter to pass memory pointers from tcp4_l2_flags_iov[] and tcp6_l2_flags_iov[]. This separation sets the stage for utilizing tcp_fill_flag_header() to set the memory provided by the guest via vhost-user in future developments. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tcp.c | 78 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/tcp.c b/tcp.c index dd8d46e08628..e40f42259451 100644 --- a/tcp.c +++ b/tcp.c @@ -1567,24 +1567,25 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, } /** - * tcp_send_flag() - Send segment with flags to tap (no payload) + * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due + * @th: TCP header to update + * @data: buffer to store TCP option + * @optlen: size of the TCP option buffer (output parameter) * - * Return: negative error code on connection reset, 0 otherwise + * Return: < 0 error code on connection reset, + * 0 if there is no flag to send + * 1 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, char *data, + size_t *optlen) { - struct tcp_flags_t *payload; struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; - size_t optlen = 0; - struct tcphdr *th; - struct iovec *iov; - size_t l4len; - char *data; if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) @@ -1606,20 +1607,12 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) return 0; - if (CONN_V4(conn)) - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - th = &payload->th; - data = payload->opts; - + *optlen = 0; if (flags & SYN) { int mss; /* Options: MSS, NOP and window scale (8 bytes) */ - optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; + *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; *data++ = OPT_MSS; *data++ = OPT_MSS_LEN; @@ -1653,17 +1646,13 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) flags |= ACK; } - th->doff = (sizeof(*th) + optlen) / 4; + th->doff = (sizeof(*th) + *optlen) / 4; th->ack = !!(flags & ACK); th->rst = !!(flags & RST); th->syn = !!(flags & SYN); th->fin = !!(flags & FIN); - l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, - conn->seq_to_tap); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) conn_flag(c, conn, ~ACK_TO_TAP_DUE); @@ -1678,6 +1667,47 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (th->fin || th->syn) conn->seq_to_tap++; + return 1; +} + +/** + * tcp_send_flag() - Send segment with flags to tap (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not 
set, send segment only if ACK is due + * + * Return: negative error code on connection reset, 0 otherwise + */ +static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_flags_t *payload; + struct iovec *iov; + size_t optlen; + size_t l4len; + uint32_t seq; + int ret; + + if (CONN_V4(conn)) + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) { + if (CONN_V4(conn)) + tcp4_flags_used--; + else + tcp6_flags_used--; + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (flags & DUP_ACK) { struct iovec *dup_iov; int i; -- 2.45.2
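The "future developments" mentioned in the commit message can be pictured
roughly as follows. This is only a sketch: tcp_vu_send_flag(),
vu_flags_buffer() and vu_send_flags_frame() are made-up names, and no
vhost-user code is part of this patch. The point is that
tcp_prepare_flags() no longer cares where the tcphdr and options buffer
live:

static int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	/* Hypothetical: flags buffer living in guest-shared memory instead
	 * of tcp4_l2_flags_iov[] / tcp6_l2_flags_iov[]
	 */
	struct tcp_flags_t *payload = vu_flags_buffer(c, conn);
	size_t optlen;
	int ret;

	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
				payload->opts, &optlen);
	if (ret <= 0)
		return ret;	/* nothing to send, or connection reset */

	/* Header filling and frame submission would then go through the
	 * vhost-user queue rather than tap_send_frames()
	 */
	return vu_send_flags_frame(c, conn, payload, optlen);
}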
On 13/06/2024 14:36, Laurent Vivier wrote:
> This commit isolates the internal data structure management used for
> storing data (e.g., tcp4_l2_flags_iov[], tcp6_l2_flags_iov[],
> tcp4_flags_ip[], tcp4_flags[], ...) from the tcp_send_flag() function.
> The extracted functionality is relocated to a new function named
> tcp_fill_flag_header().
>
> tcp_fill_flag_header() is now a generic function that accepts
> parameters such as struct tcphdr and a data pointer. tcp_send_flag()
> utilizes this parameter to pass memory pointers from
> tcp4_l2_flags_iov[] and tcp6_l2_flags_iov[].
>
> This separation sets the stage for utilizing tcp_fill_flag_header() to
> set the memory provided by the guest via vhost-user in future
> developments.
>
> [...]

Forgot to update the comment:

s/tcp_fill_flag_header/tcp_prepare_flags/

Could you update on merge?

Thanks,
Laurent
On Thu, 13 Jun 2024 14:58:42 +0200
Laurent Vivier <lvivier(a)redhat.com> wrote:

> On 13/06/2024 14:36, Laurent Vivier wrote:
> > This commit isolates the internal data structure management used for
> > storing data (e.g., tcp4_l2_flags_iov[], tcp6_l2_flags_iov[],
> > tcp4_flags_ip[], tcp4_flags[], ...) from the tcp_send_flag()
> > function. The extracted functionality is relocated to a new function
> > named tcp_fill_flag_header().
> >
> > tcp_fill_flag_header() is now a generic function that accepts
> > parameters such as struct tcphdr and a data pointer. tcp_send_flag()
> > utilizes this parameter to pass memory pointers from
> > tcp4_l2_flags_iov[] and tcp6_l2_flags_iov[].
> >
> > This separation sets the stage for utilizing tcp_fill_flag_header()
> > to set the memory provided by the guest via vhost-user in future
> > developments.
>
> Forgot to update the comment:
>
> s/tcp_fill_flag_header/tcp_prepare_flags/
>
> Could you update on merge?

Sure. Running tests now...

-- 
Stefano
Move all the TCP parts using internal buffers to tcp_buf.c and keep generic TCP management functions in tcp.c. Add tcp_internal.h to export needed functions from tcp.c and tcp_buf.h from tcp_buf.c With this change we can use existing TCP functions with a different kind of memory storage as for instance the shared memory provided by the guest via vhost-user. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 5 +- tcp.c | 568 ++----------------------------------------------- tcp_buf.c | 519 ++++++++++++++++++++++++++++++++++++++++++++ tcp_buf.h | 16 ++ tcp_internal.h | 96 +++++++++ 5 files changed, 654 insertions(+), 550 deletions(-) create mode 100644 tcp_buf.c create mode 100644 tcp_buf.h create mode 100644 tcp_internal.h diff --git a/Makefile b/Makefile index e2180b599bdb..09fc461d087e 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h + siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ + udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/tcp.c b/tcp.c index e40f42259451..68524235347c 100644 --- a/tcp.c +++ b/tcp.c @@ -302,28 +302,14 @@ #include "flow.h" #include "flow_table.h" - -#define TCP_FRAMES_MEM 128 -#define TCP_FRAMES \ - (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) +#include "tcp_internal.h" +#include "tcp_buf.h" #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) -#define MAX_WS 8 -#define MAX_WINDOW (1 << (16 + (MAX_WS))) - /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 -#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ - sizeof(struct tcphdr) - \ - sizeof(struct iphdr), \ - sizeof(uint32_t)) -#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ - sizeof(struct tcphdr) - \ - sizeof(struct ipv6hdr), \ - sizeof(uint32_t)) - #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND # define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd) @@ -345,33 +331,10 @@ */ #define SOL_TCP IPPROTO_TCP -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) - -#define FIN (1 << 0) -#define SYN (1 << 1) -#define RST (1 << 2) -#define ACK (1 << 4) -/* Flags for internal usage */ -#define DUP_ACK (1 << 5) #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ -#define OPT_EOL 0 -#define OPT_NOP 1 -#define OPT_MSS 2 -#define OPT_MSS_LEN 4 -#define OPT_WS 3 -#define OPT_WS_LEN 3 -#define OPT_SACKP 4 -#define OPT_SACK 5 -#define OPT_TS 8 - #define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) -#define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ (((conn)->events & ESTABLISHED) && \ ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -408,106 +371,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -/* Static buffers */ -/** - * struct tcp_payload_t - TCP header and data to send segments with payload - * @th: TCP header - * @data: TCP data - */ -struct tcp_payload_t { - struct tcphdr th; - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/** - * struct tcp_flags_t - TCP header and data to send zero-length - * segments (flags) - * @th: TCP header - * @opts TCP options - */ -struct tcp_flags_t { - struct tcphdr th; - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/* Ethernet header for IPv4 frames */ -static struct ethhdr tcp4_eth_src; - -static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers */ -static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; -/* TCP segments with payload for IPv4 frames */ -static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; - -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); - -/* References tracking the owner connection of frames in the tap outqueue */ -static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM]; -static unsigned int tcp4_payload_used; - -static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv4 headers for TCP segment without payload */ -static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; -/* TCP segments without payload for IPv4 frames */ -static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp4_flags_used; - -/* Ethernet header for IPv6 frames */ -static struct ethhdr tcp6_eth_src; - -static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers */ -static struct ipv6hdr 
tcp6_payload_ip[TCP_FRAMES_MEM]; -/* TCP headers and data for IPv6 frames */ -static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; - -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); - -/* References tracking the owner connection of frames in the tap outqueue */ -static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM]; -static unsigned int tcp6_payload_used; - -static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; -/* IPv6 headers for TCP segment without payload */ -static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; -/* TCP segment without payload for IPv6 frames */ -static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; - -static unsigned int tcp6_flags_used; - -/* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; - -/* - * enum tcp_iov_parts - I/O vector parts for one TCP frame - * @TCP_IOV_TAP tap backend specific header - * @TCP_IOV_ETH Ethernet header - * @TCP_IOV_IP IP (v4/v6) header - * @TCP_IOV_PAYLOAD IP payload (TCP header + data) - * @TCP_NUM_IOVS the number of entries in the iovec array - */ -enum tcp_iov_parts { - TCP_IOV_TAP = 0, - TCP_IOV_ETH = 1, - TCP_IOV_IP = 2, - TCP_IOV_PAYLOAD = 3, - TCP_NUM_IOVS -}; - -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +char tcp_buf_discard [MAX_WINDOW]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -552,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLRDHUP; } -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag); -#define conn_flag(c, conn, flag) \ - do { \ - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ - } while (0) - /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events * @c: Execution context @@ -671,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag) +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag) { if (flag & (flag - 1)) { int flag_index = fls(~flag); @@ -722,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c, * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event) +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event) { int prev, new, num = fls(event); @@ -771,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -#define conn_event(c, conn, event) \ - do { \ - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ - conn_event_do(c, conn, event); \ - } while (0) - /** * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint * @conn: Connection pointer @@ -906,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) th->check = csum(th, l4len, sum); } -/** - * tcp_update_l2_buf() - Update Ethernet header buffers with addresses - * @eth_d: Ethernet destination address, NULL if unchanged - * @eth_s: Ethernet source 
address, NULL if unchanged - */ -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) -{ - eth_update_mac(&tcp4_eth_src, eth_d, eth_s); - eth_update_mac(&tcp6_eth_src, eth_d, eth_s); -} - -/** - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets - * @c: Execution context - */ -static void tcp_sock4_iov_init(const struct ctx *c) -{ - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); - - for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { - tcp4_payload_ip[i] = iph; - tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { - tcp4_flags_ip[i] = iph; - tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp4_flags[i].th.ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp4_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; - } -} - -/** - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets - * @c: Execution context - */ -static void tcp_sock6_iov_init(const struct ctx *c) -{ - struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); - - for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { - tcp6_payload_ip[i] = ip6; - tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_payload[i].th.ack = 1; - } - - for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { - tcp6_flags_ip[i] = ip6; - tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; - tcp6_flags[i].th .ack = 1; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; - } - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - iov = tcp6_l2_flags_iov[i]; - - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; - } -} - /** * tcp_opt_get() - Get option, and value if any, from TCP header * @opts: Pointer to start of TCP options in header @@ -1227,76 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn) return true; } -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); -#define tcp_rst(c, conn) \ - do { \ - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ - tcp_rst_do(c, conn); \ - } while (0) - -/** - * tcp_flags_flush() - Send out buffers for segments with no data (flags) - * @c: Execution context - */ -static void tcp_flags_flush(const struct ctx *c) -{ - tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp6_flags_used); - tcp6_flags_used = 0; - - tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, - tcp4_flags_used); - tcp4_flags_used = 0; -} - -/** - * tcp_revert_seq() - 
Revert affected conn->seq_to_tap after failed transmission - * @conns: Array of connection pointers corresponding to queued frames - * @frames: Two-dimensional array containing queued frames with sub-iovs - * @num_frames: Number of entries in the two arrays to be compared - */ -static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS], - int num_frames) -{ - int i; - - for (i = 0; i < num_frames; i++) { - const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base; - struct tcp_tap_conn *conn = conns[i]; - uint32_t seq = ntohl(th->seq); - - if (SEQ_LE(conn->seq_to_tap, seq)) - continue; - - conn->seq_to_tap = seq; - } -} - -/** - * tcp_payload_flush() - Send out buffers for segments with data - * @c: Execution context - */ -static void tcp_payload_flush(const struct ctx *c) -{ - size_t m; - - m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, - tcp6_payload_used); - if (m != tcp6_payload_used) { - tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m], - tcp6_payload_used - m); - } - tcp6_payload_used = 0; - - m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, - tcp4_payload_used); - if (m != tcp4_payload_used) { - tcp_revert_seq(&tcp4_frame_conns[m], &tcp4_l2_iov[m], - tcp4_payload_used - m); - } - tcp4_payload_used = 0; -} - /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context @@ -1430,10 +1112,10 @@ static size_t tcp_fill_headers6(const struct ctx *c, * * Return: IP payload length, host order */ -static size_t tcp_l2_buf_fill_headers(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct iovec *iov, size_t dlen, - const uint16_t *check, uint32_t seq) +size_t tcp_l2_buf_fill_headers(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct iovec *iov, size_t dlen, + const uint16_t *check, uint32_t seq) { const struct in_addr *a4 = inany_v4(&conn->faddr); @@ -1459,8 +1141,8 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; @@ -1579,9 +1261,9 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * 0 if there is no flag to send * 1 otherwise */ -static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, - int flags, struct tcphdr *th, char *data, - size_t *optlen) +int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, + int flags, struct tcphdr *th, char *data, + size_t *optlen) { struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); @@ -1678,60 +1360,9 @@ static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { - struct tcp_flags_t *payload; - struct iovec *iov; - size_t optlen; - size_t l4len; - uint32_t seq; - int ret; - - if (CONN_V4(conn)) - iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - payload = iov[TCP_IOV_PAYLOAD].iov_base; - - seq = conn->seq_to_tap; - ret = tcp_prepare_flags(c, conn, flags, &payload->th, - payload->opts, &optlen); - if (ret <= 
0) { - if (CONN_V4(conn)) - tcp4_flags_used--; - else - tcp6_flags_used--; - return ret; - } - - l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - - if (flags & DUP_ACK) { - struct iovec *dup_iov; - int i; - - if (CONN_V4(conn)) - dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - else - dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - - for (i = 0; i < TCP_NUM_IOVS; i++) - memcpy(dup_iov[i].iov_base, iov[i].iov_base, - iov[i].iov_len); - dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; - } - - if (CONN_V4(conn)) { - if (tcp4_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } else { - if (tcp6_flags_used > TCP_FRAMES_MEM - 2) - tcp_flags_flush(c); - } - - return 0; + return tcp_buf_send_flag(c, conn, flags); } /** @@ -1739,7 +1370,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -2166,49 +1797,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) return 0; } -/** - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer - * @c: Execution context - * @conn: Connection pointer - * @dlen: TCP payload length - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer - * @seq: Sequence number to be sent - */ -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t dlen, int no_csum, uint32_t seq) -{ - struct iovec *iov; - size_t l4len; - - conn->seq_to_tap = seq + dlen; - - if (CONN_V4(conn)) { - struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; - const uint16_t *check = NULL; - - if (no_csum) { - struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; - check = &iph->check; - } - - tcp4_frame_conns[tcp4_payload_used] = conn; - - iov = tcp4_l2_iov[tcp4_payload_used++]; - l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp4_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); - } else if (CONN_V6(conn)) { - tcp6_frame_conns[tcp6_payload_used] = conn; - - iov = tcp6_l2_iov[tcp6_payload_used++]; - l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq); - iov[TCP_IOV_PAYLOAD].iov_len = l4len; - if (tcp6_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); - } -} - /** * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context @@ -2220,123 +1808,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int sendlen, len, dlen, v4 = CONN_V4(conn); - int s = conn->sock, i, ret = 0; - struct msghdr mh_sock = { 0 }; - uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; - struct iovec *iov; - - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - - if (SEQ_LT(already_sent, 0)) { - /* RFC 761, section 2.1. 
*/ - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", - conn->seq_ack_from_tap, conn->seq_to_tap); - conn->seq_to_tap = conn->seq_ack_from_tap; - already_sent = 0; - } - - if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); - return 0; - } - - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); - if (fill_bufs > TCP_FRAMES) { - fill_bufs = TCP_FRAMES; - iov_rem = 0; - } else { - iov_rem = (wnd_scaled - already_sent) % mss; - } - - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - - if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || - (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { - tcp_payload_flush(c); - - /* Silence Coverity CWE-125 false positive */ - tcp4_payload_used = tcp6_payload_used = 0; - } - - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { - if (v4) - iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; - else - iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; - iov->iov_len = mss; - } - if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; - - /* Receive into buffers, don't dequeue until acknowledged by guest. */ - do - len = recvmsg(s, &mh_sock, MSG_PEEK); - while (len < 0 && errno == EINTR); - - if (len < 0) - goto err; - - if (!len) { - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_send_flag(c, conn, FIN | ACK))) { - tcp_rst(c, conn); - return ret; - } - - conn_event(c, conn, TAP_FIN_SENT); - } - - return 0; - } - - sendlen = len - already_sent; - if (sendlen <= 0) { - conn_flag(c, conn, STALLED); - return 0; - } - - conn_flag(c, conn, ~STALLED); - - send_bufs = DIV_ROUND_UP(sendlen, mss); - last_len = sendlen - (send_bufs - 1) * mss; - - /* Likely, some new data was acked too. */ - tcp_update_seqack_wnd(c, conn, 0, NULL); - - /* Finally, queue to tap */ - dlen = mss; - seq = conn->seq_to_tap; - for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; - - if (i == send_bufs - 1) - dlen = last_len; - - tcp_data_to_tap(c, conn, dlen, no_csum, seq); - seq += dlen; - } - - conn_flag(c, conn, ACK_FROM_TAP_DUE); - - return 0; - -err: - if (errno != EAGAIN && errno != EWOULDBLOCK) { - ret = -errno; - tcp_rst(c, conn); - } - - return ret; + return tcp_buf_data_from_sock(c, conn); } /** diff --git a/tcp_buf.c b/tcp_buf.c new file mode 100644 index 000000000000..5f4df5bb3b0c --- /dev/null +++ b/tcp_buf.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tcp_buf.c - TCP L2 buffer management functions + * + * Copyright Red Hat + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <errno.h> + +#include <netinet/ip.h> + +#include <linux/tcp.h> + +#include "util.h" +#include "ip.h" +#include "iov.h" +#include "passt.h" +#include "tap.h" +#include "siphash.h" +#include "inany.h" +#include "tcp_conn.h" +#include "tcp_internal.h" +#include "tcp_buf.h" + +#define TCP_FRAMES_MEM 128 +#define TCP_FRAMES \ + (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) + +/* Static buffers */ +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/* Ethernet header for IPv4 frames */ +static struct ethhdr tcp4_eth_src; + +static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM]; +/* IPv4 headers */ +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; +/* TCP segments with payload for IPv4 frames */ +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM]; + +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516"); + +/* References tracking the owner connection of frames in the tap outqueue */ +static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM]; +static unsigned int tcp4_payload_used; + +static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM]; +/* IPv4 headers for TCP segment without payload */ +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; +/* TCP segments without payload for IPv4 frames */ +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp4_flags_used; + +/* Ethernet header for IPv6 frames */ +static struct ethhdr tcp6_eth_src; + +static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM]; +/* IPv6 headers */ +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; +/* TCP headers and data for IPv6 frames */ +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM]; + +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516"); + +/* References tracking the owner connection of frames in the tap outqueue */ +static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM]; +static unsigned int tcp6_payload_used; + +static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM]; +/* IPv6 headers for TCP segment without payload */ +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; +/* TCP segment without payload for IPv6 frames */ +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM]; + +static unsigned int tcp6_flags_used; + +/* recvmsg()/sendmsg() data for tap */ +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; +/** + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses + * @eth_d: Ethernet destination address, NULL if unchanged + * @eth_s: Ethernet source address, NULL if unchanged + */ +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) +{ + eth_update_mac(&tcp4_eth_src, eth_d, eth_s); + eth_update_mac(&tcp6_eth_src, eth_d, eth_s); +} + +/** + * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * @c: Execution context + */ +void tcp_sock4_iov_init(const struct ctx *c) +{ + struct iphdr iph = 
L2_BUF_IP4_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); + + for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) { + tcp4_payload_ip[i] = iph; + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_payload[i].th.ack = 1; + } + + for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) { + tcp4_flags_ip[i] = iph; + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp4_flags[i].th.ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp4_l2_flags_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]); + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i]; + } +} + +/** + * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets + * @c: Execution context + */ +void tcp_sock6_iov_init(const struct ctx *c) +{ + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6); + + for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) { + tcp6_payload_ip[i] = ip6; + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_payload[i].th.ack = 1; + } + + for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) { + tcp6_flags_ip[i] = ip6; + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4; + tcp6_flags[i].th .ack = 1; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; + } + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + iov = tcp6_l2_flags_iov[i]; + + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]); + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src); + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]); + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i]; + } +} + +/** + * tcp_flags_flush() - Send out buffers for segments with no data (flags) + * @c: Execution context + */ +void tcp_flags_flush(const struct ctx *c) +{ + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp6_flags_used); + tcp6_flags_used = 0; + + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS, + tcp4_flags_used); + tcp4_flags_used = 0; +} + +/** + * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission + * @conns: Array of connection pointers corresponding to queued frames + * @frames: Two-dimensional array containing queued frames with sub-iovs + * @num_frames: Number of entries in the two arrays to be compared + */ +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS], + int num_frames) +{ + int i; + + for (i = 0; i < num_frames; i++) { + const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base; + struct tcp_tap_conn *conn = conns[i]; + uint32_t seq = ntohl(th->seq); + + if (SEQ_LE(conn->seq_to_tap, seq)) + continue; + + conn->seq_to_tap = seq; + } +} + +/** + * tcp_payload_flush() - Send out buffers for segments with data + * @c: Execution context + */ +void tcp_payload_flush(const struct ctx *c) +{ + size_t m; + + m = 
tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS, + tcp6_payload_used); + if (m != tcp6_payload_used) { + tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m], + tcp6_payload_used - m); + } + tcp6_payload_used = 0; + + m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS, + tcp4_payload_used); + if (m != tcp4_payload_used) { + tcp_revert_seq(&tcp4_frame_conns[m], &tcp4_l2_iov[m], + tcp4_payload_used - m); + } + tcp4_payload_used = 0; +} + +/** + * tcp_buf_send_flag() - Send segment with flags to tap (no payload) + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * + * Return: negative error code on connection reset, 0 otherwise + */ +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + struct tcp_flags_t *payload; + struct iovec *iov; + size_t optlen; + size_t l4len; + uint32_t seq; + int ret; + + if (CONN_V4(conn)) + iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + payload = iov[TCP_IOV_PAYLOAD].iov_base; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, + payload->opts, &optlen); + if (ret <= 0) { + if (CONN_V4(conn)) + tcp4_flags_used--; + else + tcp6_flags_used--; + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL, seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (flags & DUP_ACK) { + struct iovec *dup_iov; + int i; + + if (CONN_V4(conn)) + dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++]; + else + dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++]; + + for (i = 0; i < TCP_NUM_IOVS; i++) + memcpy(dup_iov[i].iov_base, iov[i].iov_base, + iov[i].iov_len); + dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len; + } + + if (CONN_V4(conn)) { + if (tcp4_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } else { + if (tcp6_flags_used > TCP_FRAMES_MEM - 2) + tcp_flags_flush(c); + } + + return 0; +} + +/** + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer + * @c: Execution context + * @conn: Connection pointer + * @dlen: TCP payload length + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer + * @seq: Sequence number to be sent + */ +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, + ssize_t dlen, int no_csum, uint32_t seq) +{ + struct iovec *iov; + size_t l4len; + + conn->seq_to_tap = seq + dlen; + + if (CONN_V4(conn)) { + struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1]; + const uint16_t *check = NULL; + + if (no_csum) { + struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base; + check = &iph->check; + } + + tcp4_frame_conns[tcp4_payload_used] = conn; + + iov = tcp4_l2_iov[tcp4_payload_used++]; + l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (tcp4_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); + } else if (CONN_V6(conn)) { + tcp6_frame_conns[tcp6_payload_used] = conn; + + iov = tcp6_l2_iov[tcp6_payload_used++]; + l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; + if (tcp6_payload_used > TCP_FRAMES_MEM - 1) + tcp_payload_flush(c); + } +} + +/** + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: negative on connection reset, 0 otherwise + * + * #syscalls recvmsg + */ +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + 
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; + int sendlen, len, dlen, v4 = CONN_V4(conn); + int s = conn->sock, i, ret = 0; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + uint32_t already_sent, seq; + struct iovec *iov; + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + if (fill_bufs > TCP_FRAMES) { + fill_bufs = TCP_FRAMES; + iov_rem = 0; + } else { + iov_rem = (wnd_scaled - already_sent) % mss; + } + + mh_sock.msg_iov = iov_sock; + mh_sock.msg_iovlen = fill_bufs + 1; + + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { + tcp_payload_flush(c); + + /* Silence Coverity CWE-125 false positive */ + tcp4_payload_used = tcp6_payload_used = 0; + } + + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + if (v4) + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data; + else + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data; + iov->iov_len = mss; + } + if (iov_rem) + iov_sock[fill_bufs].iov_len = iov_rem; + + /* Receive into buffers, don't dequeue until acknowledged by guest. */ + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + sendlen = len - already_sent; + if (sendlen <= 0) { + conn_flag(c, conn, STALLED); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + send_bufs = DIV_ROUND_UP(sendlen, mss); + last_len = sendlen - (send_bufs - 1) * mss; + + /* Likely, some new data was acked too. 
*/ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* Finally, queue to tap */ + dlen = mss; + seq = conn->seq_to_tap; + for (i = 0; i < send_bufs; i++) { + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; + + if (i == send_bufs - 1) + dlen = last_len; + + tcp_data_to_tap(c, conn, dlen, no_csum, seq); + seq += dlen; + } + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; + +err: + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_buf.h b/tcp_buf.h new file mode 100644 index 000000000000..14be7b945285 --- /dev/null +++ b/tcp_buf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_BUF_H +#define TCP_BUF_H + +void tcp_sock4_iov_init(const struct ctx *c); +void tcp_sock6_iov_init(const struct ctx *c); +void tcp_flags_flush(const struct ctx *c); +void tcp_payload_flush(const struct ctx *c); +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); + +#endif /*TCP_BUF_H */ diff --git a/tcp_internal.h b/tcp_internal.h new file mode 100644 index 000000000000..51aaa16918cf --- /dev/null +++ b/tcp_internal.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_INTERNAL_H +#define TCP_INTERNAL_H + +#define MAX_WS 8 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) + +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct iphdr), \ + sizeof(uint32_t)) +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \ + sizeof(struct tcphdr) - \ + sizeof(struct ipv6hdr), \ + sizeof(uint32_t)) + +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) +#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +/* Flags for internal usage */ +#define DUP_ACK (1 << 5) +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_MSS_LEN 4 +#define OPT_WS 3 +#define OPT_WS_LEN 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/* + * enum tcp_iov_parts - I/O vector parts for one TCP frame + * @TCP_IOV_TAP tap backend specific header + * @TCP_IOV_ETH Ethernet header + * @TCP_IOV_IP IP (v4/v6) header + * @TCP_IOV_PAYLOAD IP payload (TCP header + data) + * @TCP_NUM_IOVS the number of entries in the iovec array + */ +enum tcp_iov_parts { + TCP_IOV_TAP = 0, + TCP_IOV_ETH = 1, + TCP_IOV_IP = 2, + TCP_IOV_PAYLOAD = 3, + TCP_NUM_IOVS +}; + +extern char tcp_buf_discard [MAX_WINDOW]; + +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag); +#define conn_flag(c, conn, flag) \ + do { \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + + +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event); +#define conn_event(c, conn, event) \ + do { \ + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); +#define tcp_rst(c, conn) \ + do { \ + flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ + 
tcp_rst_do(c, conn); \ + } while (0) + +size_t tcp_l2_buf_fill_headers(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct iovec *iov, size_t dlen, + const uint16_t *check, uint32_t seq); +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo); +int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags, + struct tcphdr *th, char *data, size_t *optlen); + +#endif /* TCP_INTERNAL_H */ -- 2.45.2
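With tcp_buf_send_flag() and tcp_buf_data_from_sock() sitting behind the
thin wrappers left in tcp.c, switching storage backends later becomes a
simple dispatch. A hypothetical sketch only: MODE_VU and tcp_vu_send_flag()
are assumptions, not introduced by this series:

int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	if (c->mode == MODE_VU)		/* hypothetical future mode */
		return tcp_vu_send_flag(c, conn, flags);

	return tcp_buf_send_flag(c, conn, flags);
}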
Consolidate pool_tap4() and pool_tap6() into tap_flush_pools(), and tap4_handler() and tap6_handler() into tap_handler(). Create a generic tap_add_packet() to consolidate packet addition logic and reduce code duplication. The purpose is to ease the export of these functions to use them with the vhost-user backend. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 110 ++++++++++++++++++++++++++++++++-------------------------- tap.h | 3 ++ 2 files changed, 64 insertions(+), 49 deletions(-) diff --git a/tap.c b/tap.c index 26084b486519..46ea24a8e793 100644 --- a/tap.c +++ b/tap.c @@ -921,6 +921,58 @@ append: return in->count; } +/** + * tap_flush_pools() - Flush both IPv4 and IPv6 packet pools + */ +void tap_flush_pools(void) +{ + pool_flush(pool_tap4); + pool_flush(pool_tap6); +} + +/** + * tap_handler() - IPv4/IPv6 and ARP packet handler for tap file descriptor + * @c: Execution context + * @now: Current timestamp + */ +void tap_handler(struct ctx *c, const struct timespec *now) +{ + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); +} + +/** + * tap_add_packet() - Queue/capture packet, update notion of guest MAC address + * @c: Execution context + * @l2len: Total L2 packet length + * @p: Packet buffer + */ +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) +{ + const struct ethhdr *eh; + + pcap(p, l2len); + + eh = (struct ethhdr *)p; + + if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { + memcpy(c->mac_guest, eh->h_source, ETH_ALEN); + proto_update_l2_buf(c->mac_guest, NULL); + } + + switch (ntohs(eh->h_proto)) { + case ETH_P_ARP: + case ETH_P_IP: + packet_add(pool_tap4, l2len, p); + break; + case ETH_P_IPV6: + packet_add(pool_tap6, l2len, p); + break; + default: + break; + } +} + /** * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context @@ -947,7 +999,6 @@ static void tap_sock_reset(struct ctx *c) void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now) { - const struct ethhdr *eh; ssize_t n, rem; char *p; @@ -960,8 +1011,7 @@ redo: p = pkt_buf; rem = 0; - pool_flush(pool_tap4); - pool_flush(pool_tap6); + tap_flush_pools(); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); if (n < 0) { @@ -988,38 +1038,18 @@ redo: /* Complete the partial read above before discarding a malformed * frame, otherwise the stream will be inconsistent. */ - if (l2len < (ssize_t)sizeof(*eh) || + if (l2len < (ssize_t)sizeof(struct ethhdr) || l2len > (ssize_t)ETH_MAX_MTU) goto next; - pcap(p, l2len); - - eh = (struct ethhdr *)p; - - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, l2len, p); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, l2len, p); - break; - default: - break; - } + tap_add_packet(c, l2len, p); next: p += l2len; n -= l2len; } - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler(c, now); /* We can't use EPOLLET otherwise. 
*/ if (rem) @@ -1044,35 +1074,18 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, redo: n = 0; - pool_flush(pool_tap4); - pool_flush(pool_tap6); + tap_flush_pools(); restart: while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { - const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); - if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) { + if (len < (ssize_t)sizeof(struct ethhdr) || + len > (ssize_t)ETH_MAX_MTU) { n += len; continue; } - pcap(pkt_buf + n, len); - if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { - memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL); - } - - switch (ntohs(eh->h_proto)) { - case ETH_P_ARP: - case ETH_P_IP: - packet_add(pool_tap4, len, pkt_buf + n); - break; - case ETH_P_IPV6: - packet_add(pool_tap6, len, pkt_buf + n); - break; - default: - break; - } + tap_add_packet(c, len, pkt_buf + n); if ((n += len) == TAP_BUF_BYTES) break; @@ -1083,8 +1096,7 @@ restart: ret = errno; - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + tap_handler(c, now); if (len > 0 || ret == EAGAIN) return; diff --git a/tap.h b/tap.h index 2285a87093f9..d496bd0e4b99 100644 --- a/tap.h +++ b/tap.h @@ -70,5 +70,8 @@ void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); void tap_sock_init(struct ctx *c); +void tap_flush_pools(void); +void tap_handler(struct ctx *c, const struct timespec *now); +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p); #endif /* TAP_H */ -- 2.45.2
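The three exported helpers let a new backend reuse the same per-frame logic
as tap_handler_passt() and tap_handler_pasta(). A minimal sketch of such a
receive path, assuming a hypothetical backend: tap_handler_vu() and
vu_next_frame() are placeholders for however that backend walks the frames
it received from the guest:

void tap_handler_vu(struct ctx *c, const struct timespec *now)
{
	ssize_t l2len;
	char *p;

	tap_flush_pools();

	while ((p = vu_next_frame(c, &l2len))) {	/* hypothetical iterator */
		if (l2len < (ssize_t)sizeof(struct ethhdr) ||
		    l2len > (ssize_t)ETH_MAX_MTU)
			continue;

		tap_add_packet(c, l2len, p);
	}

	tap_handler(c, now);
}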
This commit refactors the udp_update_hdr4() and udp_update_hdr6() functions to improve code portability by replacing the udp_meta_t parameter with more specific parameters for the IPv4 and IPv6 headers (iphdr/ipv6hdr) and the source socket address (sockaddr_in/sockaddr_in6). It also moves the tap_hdr_update() function call inside the udp_tap_send() function not to have to pass the TAP header to udp_update_hdr4() and udp_update_hdr6() This refactor reduces complexity by making the functions more modular and ensuring that each function operates on more narrowly scoped data structures. This will facilitate future backend introduction like vhost-user. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> --- udp.c | 60 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/udp.c b/udp.c index 61e106a9f05e..5bef1af969cc 100644 --- a/udp.c +++ b/udp.c @@ -555,7 +555,8 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @c: Execution context - * @bm: Pointer to udp_meta_t to update + * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) + * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload @@ -564,15 +565,16 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, * Return: size of IPv4 payload (UDP header + data) */ static size_t udp_update_hdr4(const struct ctx *c, - struct udp_meta_t *bm, struct udp_payload_t *bp, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { - in_port_t srcport = ntohs(bm->s_in.sa4.sin_port); const struct in_addr dst = c->ip4.addr_seen; - struct in_addr src = bm->s_in.sa4.sin_addr; + in_port_t srcport = ntohs(s_in->sin_port); size_t l4len = dlen + sizeof(bp->uh); - size_t l3len = l4len + sizeof(bm->ip4h); + size_t l3len = l4len + sizeof(*ip4h); + struct in_addr src = s_in->sin_addr; if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && @@ -593,24 +595,24 @@ static size_t udp_update_hdr4(const struct ctx *c, src = c->ip4.gw; } - bm->ip4h.tot_len = htons(l3len); - bm->ip4h.daddr = dst.s_addr; - bm->ip4h.saddr = src.s_addr; - bm->ip4h.check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); + ip4h->tot_len = htons(l3len); + ip4h->daddr = dst.s_addr; + ip4h->saddr = src.s_addr; + ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); - bp->uh.source = bm->s_in.sa4.sin_port; + bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); csum_udp4(&bp->uh, src, dst, bp->data, dlen); - tap_hdr_update(&bm->taph, l3len + sizeof(udp4_eth_hdr)); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram * @c: Execution context - * @bm: Pointer to udp_meta_t to update + * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) + * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload @@ -619,13 +621,14 @@ static size_t udp_update_hdr4(const struct ctx *c, * Return: size of IPv6 payload (UDP header + data) */ static size_t udp_update_hdr6(const struct ctx *c, - struct udp_meta_t *bm, struct 
udp_payload_t *bp, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { - const struct in6_addr *src = &bm->s_in.sa6.sin6_addr; + const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; - in_port_t srcport = ntohs(bm->s_in.sa6.sin6_port); + in_port_t srcport = ntohs(s_in6->sin6_port); uint16_t l4len = dlen + sizeof(bp->uh); if (IN6_IS_ADDR_LINKLOCAL(src)) { @@ -662,19 +665,18 @@ static size_t udp_update_hdr6(const struct ctx *c, } - bm->ip6h.payload_len = htons(l4len); - bm->ip6h.daddr = *dst; - bm->ip6h.saddr = *src; - bm->ip6h.version = 6; - bm->ip6h.nexthdr = IPPROTO_UDP; - bm->ip6h.hop_limit = 255; + ip6h->payload_len = htons(l4len); + ip6h->daddr = *dst; + ip6h->saddr = *src; + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 255; - bp->uh.source = bm->s_in.sa6.sin6_port; + bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); - bp->uh.len = bm->ip6h.payload_len; + bp->uh.len = ip6h->payload_len; csum_udp6(&bp->uh, src, dst, bp->data, dlen); - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); return l4len; } @@ -707,11 +709,17 @@ static void udp_tap_send(const struct ctx *c, size_t l4len; if (v6) { - l4len = udp_update_hdr6(c, bm, bp, dstport, + l4len = udp_update_hdr6(c, &bm->ip6h, + &bm->s_in.sa6, bp, dstport, udp6_l2_mh_sock[i].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + + sizeof(udp6_eth_hdr)); } else { - l4len = udp_update_hdr4(c, bm, bp, dstport, + l4len = udp_update_hdr4(c, &bm->ip4h, + &bm->s_in.sa4, bp, dstport, udp4_l2_mh_sock[i].msg_len, now); + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + + sizeof(udp4_eth_hdr)); } tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len; } -- 2.45.2
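A hypothetical caller sketch (not code from this patch) of what the narrower signatures buy: the IPv4 header and source address can live wherever the caller keeps them, for instance in guest-supplied memory later on. my_ip4h, src, payload, dlen, dstport and taph are placeholders; in this patch the only caller is still udp_tap_send(), as the diff shows.

	/* Caller-owned header and source address, wherever they are stored */
	size_t l4len = udp_update_hdr4(c, &my_ip4h, &src, &payload,
				       dstport, dlen, now);

	/* TAP framing is now the caller's concern, done afterwards */
	tap_hdr_update(&taph, l4len + sizeof(my_ip4h) + sizeof(udp4_eth_hdr));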
We are going to introduce a variant of the function to use vhost-user buffers rather than passt internal buffers. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> --- passt.c | 2 +- udp.c | 6 +++--- udp.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/passt.c b/passt.c index dabd86ed8fbd..e747225d9a7c 100644 --- a/passt.c +++ b/passt.c @@ -365,7 +365,7 @@ loop: tcp_timer_handler(&c, ref); break; case EPOLL_TYPE_UDP: - udp_sock_handler(&c, ref, eventmask, &now); + udp_buf_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); diff --git a/udp.c b/udp.c index 5bef1af969cc..225846758924 100644 --- a/udp.c +++ b/udp.c @@ -728,7 +728,7 @@ static void udp_tap_send(const struct ctx *c, } /** - * udp_sock_handler() - Handle new data from socket + * udp_buf_sock_handler() - Handle new data from socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -736,8 +736,8 @@ static void udp_tap_send(const struct ctx *c, * * #syscalls recvmmsg */ -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, - const struct timespec *now) +void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, + const struct timespec *now) { /* For not entirely clear reasons (data locality?) pasta gets * better throughput if we receive tap datagrams one at a diff --git a/udp.h b/udp.h index 9976b6231f1c..5865def20856 100644 --- a/udp.h +++ b/udp.h @@ -9,7 +9,7 @@ #define UDP_TIMER_INTERVAL 1000 /* ms */ void udp_portmap_clear(void); -void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, +void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, -- 2.45.2
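The reason for freeing up the udp_sock_handler() name is visible in the epoll dispatch touched above. A hypothetical sketch of where this is heading; udp_vu_sock_handler() and MODE_VU do not exist in this series, only udp_buf_sock_handler() does:

	case EPOLL_TYPE_UDP:
		/* Sketch: a later series could pick the backend here. */
		if (c.mode == MODE_VU)		/* assumed future mode */
			udp_vu_sock_handler(&c, ref, eventmask, &now);
		else
			udp_buf_sock_handler(&c, ref, eventmask, &now);
		break;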
As we are going to introduce the MODE_VU that will act like the mode MODE_PASST, compare to MODE_PASTA rather than to add a comparison to MODE_VU when we check for MODE_PASST. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 14 +++++++------- isolation.c | 10 +++++----- passt.c | 2 +- tap.c | 12 ++++++------ tcp_buf.c | 2 +- udp.c | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/conf.c b/conf.c index 3998bfaa69e9..94b3ed6fa659 100644 --- a/conf.c +++ b/conf.c @@ -147,7 +147,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg, if (fwd->mode) goto mode_conflict; - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("'all' port forwarding is only allowed for passt"); fwd->mode = FWD_ALL; @@ -1118,7 +1118,7 @@ static void conf_ugid(char *runas, uid_t *uid, gid_t *gid) */ static void conf_open_files(struct ctx *c) { - if (c->mode == MODE_PASST && c->fd_tap == -1) + if (c->mode != MODE_PASTA && c->fd_tap == -1) c->fd_tap_listen = tap_sock_unix_open(c->sock_path); c->pidfile_fd = pidfile_open(c->pidfile); @@ -1285,7 +1285,7 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcp_dns = 0; break; case 6: - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--no-dhcp-dns is for passt mode only"); c->no_dhcp_dns = 1; @@ -1297,7 +1297,7 @@ void conf(struct ctx *c, int argc, char **argv) c->no_dhcp_dns_search = 0; break; case 8: - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--no-dhcp-search is for passt mode only"); c->no_dhcp_dns_search = 1; @@ -1352,7 +1352,7 @@ void conf(struct ctx *c, int argc, char **argv) break; case 14: fprintf(stdout, - c->mode == MODE_PASST ? "passt " : "pasta "); + c->mode == MODE_PASTA ? "pasta " : "passt "); fprintf(stdout, VERSION_BLOB); exit(EXIT_SUCCESS); case 15: @@ -1648,7 +1648,7 @@ void conf(struct ctx *c, int argc, char **argv) v6_only = true; break; case '1': - if (c->mode != MODE_PASST) + if (c->mode == MODE_PASTA) die("--one-off is for passt mode only"); if (c->one_off) @@ -1694,7 +1694,7 @@ void conf(struct ctx *c, int argc, char **argv) conf_ugid(runas, &uid, &gid); if (logfile) { - logfile_init(c->mode == MODE_PASST ? "passt" : "pasta", + logfile_init(c->mode == MODE_PASTA ? "pasta" : "passt", logfile, logsize); } diff --git a/isolation.c b/isolation.c index f394e93b8526..ca2c68b52ec7 100644 --- a/isolation.c +++ b/isolation.c @@ -312,7 +312,7 @@ int isolate_prefork(const struct ctx *c) * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody * ever gets around seccomp profiles -- there's no harm in passing it. */ - if (!c->foreground || c->mode == MODE_PASST) + if (!c->foreground || c->mode != MODE_PASTA) flags |= CLONE_NEWPID; if (unshare(flags)) { @@ -379,12 +379,12 @@ void isolate_postfork(const struct ctx *c) prctl(PR_SET_DUMPABLE, 0); - if (c->mode == MODE_PASST) { - prog.len = (unsigned short)ARRAY_SIZE(filter_passt); - prog.filter = filter_passt; - } else { + if (c->mode == MODE_PASTA) { prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); prog.filter = filter_pasta; + } else { + prog.len = (unsigned short)ARRAY_SIZE(filter_passt); + prog.filter = filter_passt; } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || diff --git a/passt.c b/passt.c index e747225d9a7c..a5e2c5a8e151 100644 --- a/passt.c +++ b/passt.c @@ -333,7 +333,7 @@ loop: uint32_t eventmask = events[i].events; trace("%s: epoll event on %s %i (events: 0x%08x)", - c.mode == MODE_PASST ? 
"passt" : "pasta", + c.mode == MODE_PASTA ? "pasta" : "passt", EPOLL_TYPE_STR(ref.type), ref.fd, eventmask); switch (ref.type) { diff --git a/tap.c b/tap.c index 46ea24a8e793..2a749809eaab 100644 --- a/tap.c +++ b/tap.c @@ -416,10 +416,10 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASST) - m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); - else + if (c->mode == MODE_PASTA) m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); + else + m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -1330,7 +1330,9 @@ void tap_sock_init(struct ctx *c) return; } - if (c->mode == MODE_PASST) { + if (c->mode == MODE_PASTA) { + tap_sock_tun_init(c); + } else { tap_sock_unix_init(c); /* In passt mode, we don't know the guest's MAC address until it @@ -1338,7 +1340,5 @@ void tap_sock_init(struct ctx *c) * first packets will reach it. */ memset(&c->mac_guest, 0xff, sizeof(c->mac_guest)); - } else { - tap_sock_tun_init(c); } } diff --git a/tcp_buf.c b/tcp_buf.c index 5f4df5bb3b0c..ff9f7970ee0c 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -35,7 +35,7 @@ #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ - (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) + (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM) /* Static buffers */ /** diff --git a/udp.c b/udp.c index 225846758924..dba75d7fecbd 100644 --- a/udp.c +++ b/udp.c @@ -747,7 +747,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve * whether we'll use tap or splice, always go one at a time * for pasta mode. */ - ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1); + ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES); in_port_t dstport = ref.udp.port; bool v6 = ref.udp.v6; struct mmsghdr *mmh_recv; -- 2.45.2
it was needed by a draft version of vhost-user, it is not needed anymore. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> --- iov.c | 39 --------------------------------------- iov.h | 3 --- 2 files changed, 42 deletions(-) diff --git a/iov.c b/iov.c index 52a7c014a171..3f9e229a305f 100644 --- a/iov.c +++ b/iov.c @@ -156,42 +156,3 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) return len; } - -/** - * iov_copy - Copy data from one scatter/gather I/O vector (struct iovec) to - * another. - * - * @dst_iov: Pointer to the destination array of struct iovec describing - * the scatter/gather I/O vector to copy to. - * @dst_iov_cnt: Number of elements in the destination iov array. - * @iov: Pointer to the source array of struct iovec describing - * the scatter/gather I/O vector to copy from. - * @iov_cnt: Number of elements in the source iov array. - * @offset: Offset within the source iov from where copying should start. - * @bytes: Total number of bytes to copy from iov to dst_iov. - * - * Returns: The number of elements successfully copied to the destination - * iov array. - */ -/* cppcheck-suppress unusedFunction */ -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes) -{ - unsigned int i, j; - - i = iov_skip_bytes(iov, iov_cnt, offset, &offset); - - /* copying data */ - for (j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { - size_t len = MIN(bytes, iov[i].iov_len - offset); - - dst_iov[j].iov_base = (char *)iov[i].iov_base + offset; - dst_iov[j].iov_len = len; - j++; - bytes -= len; - offset = 0; - } - - return j; -} diff --git a/iov.h b/iov.h index 5668ca5f93bc..a9e1722713b3 100644 --- a/iov.h +++ b/iov.h @@ -28,7 +28,4 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); -unsigned iov_copy(struct iovec *dst_iov, size_t dst_iov_cnt, - const struct iovec *iov, size_t iov_cnt, - size_t offset, size_t bytes); #endif /* IOVEC_H */ -- 2.45.2
buf_size is set to sizeof(pkt_buf) by default. And it seems more correct to provide the actual size of the buffer. Later a buf_size of 0 will allow vhost-user mode to detect guest memory buffers. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tap.c b/tap.c index 2a749809eaab..c9aeff19f177 100644 --- a/tap.c +++ b/tap.c @@ -602,7 +602,7 @@ resume: if (!eh) continue; if (ntohs(eh->h_proto) == ETH_P_ARP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l2len, (char *)eh); arp(c, pkt); @@ -642,7 +642,7 @@ resume: continue; if (iph->protocol == IPPROTO_ICMP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; @@ -661,7 +661,7 @@ resume: continue; if (iph->protocol == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l2len, (char *)eh); if (dhcp(c, pkt)) @@ -810,7 +810,7 @@ resume: } if (proto == IPPROTO_ICMPV6) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); if (c->no_icmp) continue; @@ -834,7 +834,7 @@ resume: uh = (struct udphdr *)l4h; if (proto == IPPROTO_UDP) { - PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + PACKET_POOL_P(pkt, 1, in->buf, in->buf_size); packet_add(pkt, l4len, l4h); -- 2.45.2
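As a rough sketch of why passing the real buffer size matters (the zero-size behaviour and guest_mem_check_range() are assumptions about the later vhost-user work, not something this patch adds): a pool range check can validate against buf_size when it is non-zero, and fall back to a guest-memory lookup when it is 0.

/* Hypothetical sketch, not part of this patch. */
static int pool_check_range_sketch(const struct pool *p,
				   const char *ptr, size_t len)
{
	if (p->buf_size == 0)		/* assumed: guest-provided buffers */
		return guest_mem_check_range(ptr, len);

	if (ptr < p->buf || ptr + len > p->buf + p->buf_size)
		return -1;		/* outside the pool's own buffer */

	return 0;
}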
On Thu, 13 Jun 2024 14:36:47 +0200
Laurent Vivier <lvivier(a)redhat.com> wrote:

> Extract buffers management code from tcp.c and move it to tcp_buf.c
>
> tcp.c keeps all the generic code and will be also used by the
> vhost-user functions.
>
> Also compare mode to MODE_PASTA, as we will manage vhost-user mode
> (MODE_VU) like MODE_PASST.
>
> v7:
> - update comments (tap_flush_pools(), func and line)
> - drop "-L4" in tcp_buf.c header
> - fix *optlen initializatio
> - fix tcp4_flags_used/tcp6_flags_used value on error
> - fix seq_to_tap value

Applied!

-- 
Stefano