Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 7 +- tcp.c | 618 ++----------------------------------------------- tcp_buf.c | 569 +++++++++++++++++++++++++++++++++++++++++++++ tcp_buf.h | 17 ++ tcp_internal.h | 78 +++++++ 5 files changed, 689 insertions(+), 600 deletions(-) create mode 100644 tcp_buf.c create mode 100644 tcp_buf.h create mode 100644 tcp_internal.h diff --git a/Makefile b/Makefile index acf37f5a2036..bf370b6ec2e6 100644 --- a/Makefile +++ b/Makefile @@ -46,8 +46,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \ igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \ - passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c udp.c \ - util.c iov.c ip.c + passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \ + tcp_buf.c udp.c util.c iov.c ip.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \ flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \ netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \ - tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h iov.h ip.h + tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \ + util.h iov.h ip.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/tcp.c b/tcp.c index 640209533772..54c15087d678 100644 --- a/tcp.c +++ b/tcp.c @@ -300,57 +300,19 @@ #include "flow.h" #include "flow_table.h" +#include "tcp_internal.h" +#include "tcp_buf.h" /* Sides of a flow as we use them in "tap" connections */ #define SOCKSIDE 0 #define TAPSIDE 1 -#define TCP_FRAMES_MEM 128 -#define TCP_FRAMES \ - (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) - #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) -#define MAX_WS 8 -#define MAX_WINDOW (1 << (16 + (MAX_WS))) - /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 -struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[26]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct iphdr iph; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ -#ifdef __AVX2__ - uint8_t pad[14]; -#else - uint8_t pad[2]; -#endif - struct tap_hdr taph; - struct ipv6hdr ip6h; - struct tcphdr th; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) -#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) - #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND # define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd) @@ -372,31 +334,9 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ */ #define SOL_TCP IPPROTO_TCP -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) - -#define FIN (1 << 0) -#define SYN (1 << 1) -#define RST (1 << 2) -#define ACK (1 << 4) -/* Flags for internal usage */ -#define DUP_ACK (1 << 5) #define ACK_IF_NEEDED 0 /* See tcp_buf_send_flag() */ -#define OPT_EOL 0 -#define OPT_NOP 1 -#define OPT_MSS 2 -#define OPT_MSS_LEN 4 -#define OPT_WS 3 -#define OPT_WS_LEN 3 -#define OPT_SACKP 4 -#define OPT_SACK 5 -#define OPT_TS 8 - -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) -#define CONN_V6(conn) (!CONN_V4(conn)) + #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -433,144 +373,11 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -/** - * tcp_buf_seq_update - Sequences to update with length of frames once sent - * @seq: Pointer to sequence number sent to tap-side, to be updated - * @len: TCP payload length - */ -struct tcp_buf_seq_update { - uint32_t *seq; - uint16_t len; -}; - -/* Static buffers */ - -/** - * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @uh: Headroom for TCP header - * @data: Storage for TCP payload - */ -static struct tcp4_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - uint8_t data[MSS4]; /* 84 60 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_buf[TCP_FRAMES_MEM]; - -static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_buf_used; - -/** - * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @data: Storage for TCP payload - */ -struct tcp6_l2_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th; /* 72 60 */ - uint8_t data[MSS6]; /* 92 80 */ - /* 65536 65532 */ -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_buf[TCP_FRAMES_MEM]; - -static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_buf_used; - -/* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; -static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; - -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; +char tcp_buf_discard [MAX_WINDOW]; /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; -/** - * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only - * @taph: Tap-level headers (partially pre-filled) - * @iph: Pre-filled IP header (except for tot_len and saddr) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp4_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[26]; /* 0, align th to 32 bytes */ -#else - uint8_t pad[2]; /* align iph to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 26 2 */ - struct iphdr iph; /* 44 20 */ - struct tcphdr th; /* 64 40 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp4_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp4_l2_flags_buf_used; - -/** - * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) - * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B - * @taph: Tap-level headers (partially pre-filled) - * @ip6h: Pre-filled IP header (except for payload_len and addresses) - * @th: Headroom for TCP header - * @opts: Headroom for TCP options - */ -static struct tcp6_l2_flags_buf_t { -#ifdef __AVX2__ - uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ -#else - uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ -#endif - struct tap_hdr taph; /* 14 2 */ - struct ipv6hdr ip6h; /* 32 20 */ - struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -tcp6_l2_flags_buf[TCP_FRAMES_MEM]; - -static unsigned int tcp6_l2_flags_buf_used; - #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ @@ -611,14 +418,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLRDHUP; } -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag); -#define conn_flag(c, conn, flag) \ - do { \ - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ - } while (0) - /** * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events * @c: Execution context @@ -730,8 +529,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag) +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag) { if (flag & (flag - 1)) { int flag_index = fls(~flag); @@ -781,8 +580,8 @@ static void tcp_hash_remove(const struct ctx *c, * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event) +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event) { int prev, new, num = fls(event); @@ -830,12 +629,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -#define conn_event(c, conn, event) \ - do { \ - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ - conn_event_do(c, conn, event); \ - } while (0) - /** * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint * @conn: Connection pointer @@ -959,91 +752,6 @@ static uint16_t tcp_update_check_tcp6(struct ipv6hdr *ip6h) return csum(th, ntohs(ip6h->payload_len), sum); } -/** - * tcp_buf_update_l2() - Update L2 buffers with Ethernet and IPv4 addresses - * @eth_d: Ethernet destination address, NULL if unchanged - * @eth_s: Ethernet source address, NULL if unchanged - */ -void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s) -{ - int i; - - for (i = 0; i < TCP_FRAMES_MEM; i++) { - struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; - struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; - struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; - struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; - - tap_update_mac(&b4->taph, eth_d, eth_s); - tap_update_mac(&b6->taph, eth_d, eth_s); - tap_update_mac(&b4f->taph, eth_d, eth_s); - tap_update_mac(&b6f->taph, eth_d, eth_s); - } -} - -/** - * tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets - * @c: Execution context - */ -static void tcp_buf_sock4_iov_init(const struct ctx *c) -{ - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); - struct iovec *iov; - int i; - - for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { - tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = iph, - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } - - for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { - tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) - }; - } - - for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); - - for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); -} - -/** - * tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets - * @c: Execution context - */ -static void tcp_buf_sock6_iov_init(const struct ctx *c) -{ - struct iovec *iov; - int i; - - for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { - tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), - .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } - }; - } - - for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { - tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { - .taph = TAP_HDR_INIT(ETH_P_IPV6), - .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) - }; - } - - for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph); - - for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); -} - /** * tcp_opt_get() - Get option, and value if any, from TCP header * @opts: Pointer to start of TCP options in header @@ -1269,46 +977,6 @@ bool tcp_flow_defer(union flow *flow) return true; } -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); -#define tcp_rst(c, conn) \ - do { \ - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ - tcp_rst_do(c, conn); \ - } while (0) - -/** - * tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags) - * @c: Execution context - */ -static void tcp_buf_l2_flags_flush(const struct ctx *c) -{ - tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); - tcp6_l2_flags_buf_used = 0; - - tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); - tcp4_l2_flags_buf_used = 0; -} - -/** - * tcp_buf_l2_data_flush() - Send out buffers for segments with data - * @c: Execution context - */ -static void tcp_buf_l2_data_flush(const struct ctx *c) -{ - unsigned i; - size_t m; - - m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); - for (i = 0; i < m; i++) - *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; - tcp6_l2_buf_used = 0; - - m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); - for (i = 0; i < m; i++) - *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; - tcp4_l2_buf_used = 0; -} - /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context @@ -1348,10 +1016,10 @@ static void tcp_set_tcp_header(struct tcphdr *th, * Return: IP frame length including L2 headers, host order */ -static size_t ipv4_fill_headers(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct iphdr *iph, size_t plen, - const uint16_t *check, uint32_t seq) +size_t ipv4_fill_headers(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct iphdr *iph, size_t plen, + const uint16_t *check, uint32_t seq) { struct tcphdr *th = (void *)(iph + 1); const struct in_addr *a4 = inany_v4(&conn->faddr); @@ -1382,10 +1050,10 @@ static size_t ipv4_fill_headers(const struct ctx *c, * Return: IP frame length including L2 headers, host order */ -static size_t ipv6_fill_headers(const struct ctx *c, - const struct tcp_tap_conn *conn, - struct ipv6hdr *ip6h, size_t plen, - uint32_t seq) +size_t ipv6_fill_headers(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct ipv6hdr *ip6h, size_t plen, + uint32_t seq) { struct tcphdr *th = (void *)(ip6h + 1); size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); @@ -1423,8 +1091,8 @@ static size_t ipv6_fill_headers(const struct ctx *c, * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - int force_seq, struct tcp_info *tinfo) +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; @@ -1539,7 +1207,8 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * Return: negative error code on connection reset, 0 otherwise */ -static int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t optlen) +int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, + struct tcphdr *th, char *data, size_t optlen) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; @@ -1629,77 +1298,13 @@ static int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, return 1; } -static int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) -{ - size_t optlen = 0; - struct iovec *iov; - size_t ip_len; - int ret; - - /* Options: MSS, NOP and window scale (8 bytes) */ - if (flags & SYN) - optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; - - if (CONN_V4(conn)) { - struct tcp4_l2_flags_buf_t *b4; - - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; - b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; - - ret = do_tcp_send_flag(c, conn, flags, &b4->th, b4->opts, - optlen); - if (ret <= 0) - return ret; - - ip_len = ipv4_fill_headers(c, conn, &b4->iph, optlen, - NULL, conn->seq_to_tap); - - iov->iov_len = tap_iov_len(c, &b4->taph, ip_len); - - if (flags & DUP_ACK) { - - memcpy(b4 + 1, b4, sizeof(*b4)); - (iov + 1)->iov_len = iov->iov_len; - tcp4_l2_flags_buf_used++; - } - - if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_buf_l2_flags_flush(c); - } else { - struct tcp6_l2_flags_buf_t *b6; - - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; - b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; - - ret = do_tcp_send_flag(c, conn, flags, &b6->th, b6->opts, - optlen); - if (ret <= 0) - return ret; - - ip_len = ipv6_fill_headers(c, conn, &b6->ip6h, optlen, - conn->seq_to_tap); - - iov->iov_len = tap_iov_len(c, &b6->taph, ip_len); - - if (flags & DUP_ACK) { - memcpy(b6 + 1, b6, sizeof(*b6)); - (iov + 1)->iov_len = iov->iov_len; - tcp6_l2_flags_buf_used++; - } - - if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_buf_l2_flags_flush(c); - } - - return 0; -} /** * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -1813,14 +1418,6 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) return s; } -static uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn) -{ - if (CONN_V4(conn)) - return MSS4; - - return MSS6; -} - /** * tcp_conn_tap_mss() - Get MSS value advertised by tap/guest * @conn: Connection pointer @@ -2072,179 +1669,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) return 0; } -/** - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer - * @c: Execution context - * @conn: Connection pointer - * @plen: Payload length at L4 - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer - * @seq: Sequence number to be sent - */ -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t plen, int no_csum, uint32_t seq) -{ - uint32_t *seq_update = &conn->seq_to_tap; - struct iovec *iov; - size_t ip_len; - - if (CONN_V4(conn)) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; - - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; - tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; - - ip_len = ipv4_fill_headers(c, conn, &b->iph, plen, - check, seq); - - iov = tcp4_l2_iov + tcp4_l2_buf_used++; - iov->iov_len = tap_iov_len(c, &b->taph, ip_len); - if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_buf_l2_data_flush(c); - } else if (CONN_V6(conn)) { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; - tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; - - ip_len = ipv6_fill_headers(c, conn, &b->ip6h, plen, seq); - - iov = tcp6_l2_iov + tcp6_l2_buf_used++; - iov->iov_len = tap_iov_len(c, &b->taph, ip_len); - if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_buf_l2_data_flush(c); - } -} - -/** - * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window - * @c: Execution context - * @conn: Connection pointer - * - * Return: negative on connection reset, 0 otherwise - * - * #syscalls recvmsg - */ -static int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) -{ - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; - int sendlen, len, plen, v4 = CONN_V4(conn); - int s = conn->sock, i, ret = 0; - struct msghdr mh_sock = { 0 }; - uint16_t mss = MSS_GET(conn); - uint32_t already_sent, seq; - struct iovec *iov; - - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; - - if (SEQ_LT(already_sent, 0)) { - /* RFC 761, section 2.1. */ - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", - conn->seq_ack_from_tap, conn->seq_to_tap); - conn->seq_to_tap = conn->seq_ack_from_tap; - already_sent = 0; - } - - if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); - return 0; - } - - /* Set up buffer descriptors we'll fill completely and partially. */ - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); - if (fill_bufs > TCP_FRAMES) { - fill_bufs = TCP_FRAMES; - iov_rem = 0; - } else { - iov_rem = (wnd_scaled - already_sent) % mss; - } - - mh_sock.msg_iov = iov_sock; - mh_sock.msg_iovlen = fill_bufs + 1; - - iov_sock[0].iov_base = tcp_buf_discard; - iov_sock[0].iov_len = already_sent; - - if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || - (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { - tcp_buf_l2_data_flush(c); - - /* Silence Coverity CWE-125 false positive */ - tcp4_l2_buf_used = tcp6_l2_buf_used = 0; - } - - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { - if (v4) - iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; - else - iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; - iov->iov_len = mss; - } - if (iov_rem) - iov_sock[fill_bufs].iov_len = iov_rem; - - /* Receive into buffers, don't dequeue until acknowledged by guest. */ - do - len = recvmsg(s, &mh_sock, MSG_PEEK); - while (len < 0 && errno == EINTR); - - if (len < 0) - goto err; - - if (!len) { - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { - tcp_rst(c, conn); - return ret; - } - - conn_event(c, conn, TAP_FIN_SENT); - } - - return 0; - } - - sendlen = len - already_sent; - if (sendlen <= 0) { - conn_flag(c, conn, STALLED); - return 0; - } - - conn_flag(c, conn, ~STALLED); - - send_bufs = DIV_ROUND_UP(sendlen, mss); - last_len = sendlen - (send_bufs - 1) * mss; - - /* Likely, some new data was acked too. */ - tcp_update_seqack_wnd(c, conn, 0, NULL); - - /* Finally, queue to tap */ - plen = mss; - seq = conn->seq_to_tap; - for (i = 0; i < send_bufs; i++) { - int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; - - if (i == send_bufs - 1) - plen = last_len; - - tcp_data_to_tap(c, conn, plen, no_csum, seq); - seq += plen; - } - - conn_flag(c, conn, ACK_FROM_TAP_DUE); - - return 0; - -err: - if (errno != EAGAIN && errno != EWOULDBLOCK) { - ret = -errno; - tcp_rst(c, conn); - } - - return ret; -} /** * tcp_data_from_tap() - tap/guest data for established connection diff --git a/tcp_buf.c b/tcp_buf.c new file mode 100644 index 000000000000..d70e7f810e4a --- /dev/null +++ b/tcp_buf.c @@ -0,0 +1,569 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tcp_buf.c - TCP L2-L4 translation state machine + * + * Copyright (c) 2020-2022 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <errno.h> + +#include <netinet/ip.h> + +#include <linux/tcp.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "tap.h" +#include "siphash.h" +#include "inany.h" +#include "tcp_conn.h" +#include "tcp_internal.h" +#include "tcp_buf.h" + +#define TCP_FRAMES_MEM 128 +#define TCP_FRAMES \ + (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) + +struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ +#ifdef __AVX2__ + uint8_t pad[26]; +#else + uint8_t pad[2]; +#endif + struct tap_hdr taph; + struct iphdr iph; + struct tcphdr th; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ +#ifdef __AVX2__ + uint8_t pad[14]; +#else + uint8_t pad[2]; +#endif + struct tap_hdr taph; + struct ipv6hdr ip6h; + struct tcphdr th; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +#define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) +#define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) + +/** + * tcp_buf_seq_update - Sequences to update with length of frames once sent + * @seq: Pointer to sequence number sent to tap-side, to be updated + * @len: TCP payload length + */ +struct tcp_buf_seq_update { + uint32_t *seq; + uint16_t len; +}; + +/* Static buffers */ + +/** + * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections + * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only + * @taph: Tap-level headers (partially pre-filled) + * @iph: Pre-filled IP header (except for tot_len and saddr) + * @uh: Headroom for TCP header + * @data: Storage for TCP payload + */ +static struct tcp4_l2_buf_t { +#ifdef __AVX2__ + uint8_t pad[26]; /* 0, align th to 32 bytes */ +#else + uint8_t pad[2]; /* align iph to 4 bytes 0 */ +#endif + struct tap_hdr taph; /* 26 2 */ + struct iphdr iph; /* 44 20 */ + struct tcphdr th; /* 64 40 */ + uint8_t data[MSS4]; /* 84 60 */ + /* 65536 65532 */ +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))) +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +#endif +tcp4_l2_buf[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; + +static unsigned int tcp4_l2_buf_used; + +/** + * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections + * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B + * @taph: Tap-level headers (partially pre-filled) + * @ip6h: Pre-filled IP header (except for payload_len and addresses) + * @th: Headroom for TCP header + * @data: Storage for TCP payload + */ +struct tcp6_l2_buf_t { +#ifdef __AVX2__ + uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ +#else + uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ +#endif + struct tap_hdr taph; /* 14 2 */ + struct ipv6hdr ip6h; /* 32 20 */ + struct tcphdr th; /* 72 60 */ + uint8_t data[MSS6]; /* 92 80 */ + /* 65536 65532 */ +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))) +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +#endif +tcp6_l2_buf[TCP_FRAMES_MEM]; + +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; + +static unsigned int tcp6_l2_buf_used; + +/* recvmsg()/sendmsg() data for tap */ +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; + +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; + +/** + * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) + * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only + * @taph: Tap-level headers (partially pre-filled) + * @iph: Pre-filled IP header (except for tot_len and saddr) + * @th: Headroom for TCP header + * @opts: Headroom for TCP options + */ +static struct tcp4_l2_flags_buf_t { +#ifdef __AVX2__ + uint8_t pad[26]; /* 0, align th to 32 bytes */ +#else + uint8_t pad[2]; /* align iph to 4 bytes 0 */ +#endif + struct tap_hdr taph; /* 26 2 */ + struct iphdr iph; /* 44 20 */ + struct tcphdr th; /* 64 40 */ + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))) +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +#endif +tcp4_l2_flags_buf[TCP_FRAMES_MEM]; + +static unsigned int tcp4_l2_flags_buf_used; + +/** + * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) + * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B + * @taph: Tap-level headers (partially pre-filled) + * @ip6h: Pre-filled IP header (except for payload_len and addresses) + * @th: Headroom for TCP header + * @opts: Headroom for TCP options + */ +static struct tcp6_l2_flags_buf_t { +#ifdef __AVX2__ + uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ +#else + uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ +#endif + struct tap_hdr taph; /* 14 2 */ + struct ipv6hdr ip6h; /* 32 20 */ + struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))) +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) +#endif +tcp6_l2_flags_buf[TCP_FRAMES_MEM]; + +static unsigned int tcp6_l2_flags_buf_used; + +/** + * tcp_buf_update_l2() - Update L2 buffers with Ethernet and IPv4 addresses + * @eth_d: Ethernet destination address, NULL if unchanged + * @eth_s: Ethernet source address, NULL if unchanged + */ +void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s) +{ + int i; + + for (i = 0; i < TCP_FRAMES_MEM; i++) { + struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; + struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; + struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; + struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; + + tap_update_mac(&b4->taph, eth_d, eth_s); + tap_update_mac(&b6->taph, eth_d, eth_s); + tap_update_mac(&b4f->taph, eth_d, eth_s); + tap_update_mac(&b6f->taph, eth_d, eth_s); + } +} + +/** + * tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets + * @c: Execution context + */ +void tcp_buf_sock4_iov_init(const struct ctx *c) +{ + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); + struct iovec *iov; + int i; + + for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { + tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { + .taph = TAP_HDR_INIT(ETH_P_IP), + .iph = iph, + .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } + }; + } + + for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { + tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { + .taph = TAP_HDR_INIT(ETH_P_IP), + .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) + }; + } + + for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) + iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); + + for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) + iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); +} + +/** + * tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets + * @c: Execution context + */ +void tcp_buf_sock6_iov_init(const struct ctx *c) +{ + struct iovec *iov; + int i; + + for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { + tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { + .taph = TAP_HDR_INIT(ETH_P_IPV6), + .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), + .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } + }; + } + + for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { + tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { + .taph = TAP_HDR_INIT(ETH_P_IPV6), + .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) + }; + } + + for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) + iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph); + + for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) + iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); +} + +/** + * tcp_buf_l2_flags_flush() - Send out buffers for segments with no data (flags) + * @c: Execution context + */ +void tcp_buf_l2_flags_flush(const struct ctx *c) +{ + tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tcp6_l2_flags_buf_used = 0; + + tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); + tcp4_l2_flags_buf_used = 0; +} + +/** + * tcp_buf_l2_data_flush() - Send out buffers for segments with data + * @c: Execution context + */ +void tcp_buf_l2_data_flush(const struct ctx *c) +{ + unsigned i; + size_t m; + + m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); + for (i = 0; i < m; i++) + *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; + tcp6_l2_buf_used = 0; + + m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); + for (i = 0; i < m; i++) + *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; + tcp4_l2_buf_used = 0; +} + +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + size_t optlen = 0; + struct iovec *iov; + size_t ip_len; + int ret; + + /* Options: MSS, NOP and window scale (8 bytes) */ + if (flags & SYN) + optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; + + if (CONN_V4(conn)) { + struct tcp4_l2_flags_buf_t *b4; + + iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; + b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; + + ret = do_tcp_send_flag(c, conn, flags, &b4->th, b4->opts, + optlen); + if (ret <= 0) + return ret; + + ip_len = ipv4_fill_headers(c, conn, &b4->iph, optlen, + NULL, conn->seq_to_tap); + + iov->iov_len = tap_iov_len(c, &b4->taph, ip_len); + + if (flags & DUP_ACK) { + + memcpy(b4 + 1, b4, sizeof(*b4)); + (iov + 1)->iov_len = iov->iov_len; + tcp4_l2_flags_buf_used++; + } + + if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) + tcp_buf_l2_flags_flush(c); + } else { + struct tcp6_l2_flags_buf_t *b6; + + iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; + b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; + + ret = do_tcp_send_flag(c, conn, flags, &b6->th, b6->opts, + optlen); + if (ret <= 0) + return ret; + + ip_len = ipv6_fill_headers(c, conn, &b6->ip6h, optlen, + conn->seq_to_tap); + + iov->iov_len = tap_iov_len(c, &b6->taph, ip_len); + + if (flags & DUP_ACK) { + memcpy(b6 + 1, b6, sizeof(*b6)); + (iov + 1)->iov_len = iov->iov_len; + tcp6_l2_flags_buf_used++; + } + + if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) + tcp_buf_l2_flags_flush(c); + } + + return 0; +} + +uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn) +{ + if (CONN_V4(conn)) + return MSS4; + + return MSS6; +} + +/** + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer + * @c: Execution context + * @conn: Connection pointer + * @plen: Payload length at L4 + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer + * @seq: Sequence number to be sent + */ +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, + ssize_t plen, int no_csum, uint32_t seq) +{ + uint32_t *seq_update = &conn->seq_to_tap; + struct iovec *iov; + size_t ip_len; + + if (CONN_V4(conn)) { + struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; + const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; + + tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; + tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; + + ip_len = ipv4_fill_headers(c, conn, &b->iph, plen, + check, seq); + + iov = tcp4_l2_iov + tcp4_l2_buf_used++; + iov->iov_len = tap_iov_len(c, &b->taph, ip_len); + if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) + tcp_buf_l2_data_flush(c); + } else if (CONN_V6(conn)) { + struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; + + tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; + tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; + + ip_len = ipv6_fill_headers(c, conn, &b->ip6h, plen, seq); + + iov = tcp6_l2_iov + tcp6_l2_buf_used++; + iov->iov_len = tap_iov_len(c, &b->taph, ip_len); + if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) + tcp_buf_l2_data_flush(c); + } +} + +/** + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window + * @c: Execution context + * @conn: Connection pointer + * + * Return: negative on connection reset, 0 otherwise + * + * #syscalls recvmsg + */ +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; + int sendlen, len, plen, v4 = CONN_V4(conn); + int s = conn->sock, i, ret = 0; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + uint32_t already_sent, seq; + struct iovec *iov; + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); + if (fill_bufs > TCP_FRAMES) { + fill_bufs = TCP_FRAMES; + iov_rem = 0; + } else { + iov_rem = (wnd_scaled - already_sent) % mss; + } + + mh_sock.msg_iov = iov_sock; + mh_sock.msg_iovlen = fill_bufs + 1; + + iov_sock[0].iov_base = tcp_buf_discard; + iov_sock[0].iov_len = already_sent; + + if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || + (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { + tcp_buf_l2_data_flush(c); + + /* Silence Coverity CWE-125 false positive */ + tcp4_l2_buf_used = tcp6_l2_buf_used = 0; + } + + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { + if (v4) + iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; + else + iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; + iov->iov_len = mss; + } + if (iov_rem) + iov_sock[fill_bufs].iov_len = iov_rem; + + /* Receive into buffers, don't dequeue until acknowledged by guest. */ + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + sendlen = len - already_sent; + if (sendlen <= 0) { + conn_flag(c, conn, STALLED); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + send_bufs = DIV_ROUND_UP(sendlen, mss); + last_len = sendlen - (send_bufs - 1) * mss; + + /* Likely, some new data was acked too. */ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* Finally, queue to tap */ + plen = mss; + seq = conn->seq_to_tap; + for (i = 0; i < send_bufs; i++) { + int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; + + if (i == send_bufs - 1) + plen = last_len; + + tcp_data_to_tap(c, conn, plen, no_csum, seq); + seq += plen; + } + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; + +err: + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_buf.h b/tcp_buf.h new file mode 100644 index 000000000000..d23031252002 --- /dev/null +++ b/tcp_buf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_BUF_H +#define TCP_BUF_H + +void tcp_buf_sock4_iov_init(const struct ctx *c); +void tcp_buf_sock6_iov_init(const struct ctx *c); +void tcp_buf_l2_flags_flush(const struct ctx *c); +void tcp_buf_l2_data_flush(const struct ctx *c); +uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn); +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); + +#endif /*TCP_BUF_H */ diff --git a/tcp_internal.h b/tcp_internal.h new file mode 100644 index 000000000000..36eb2463dd5a --- /dev/null +++ b/tcp_internal.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef TCP_INTERNAL_H +#define TCP_INTERNAL_H + +#define MAX_WS 8 +#define MAX_WINDOW (1 << (16 + (MAX_WS))) + +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) +#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) + +#define FIN (1 << 0) +#define SYN (1 << 1) +#define RST (1 << 2) +#define ACK (1 << 4) + +/* Flags for internal usage */ +#define DUP_ACK (1 << 5) +#define OPT_EOL 0 +#define OPT_NOP 1 +#define OPT_MSS 2 +#define OPT_MSS_LEN 4 +#define OPT_WS 3 +#define OPT_WS_LEN 3 +#define OPT_SACKP 4 +#define OPT_SACK 5 +#define OPT_TS 8 + +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +extern char tcp_buf_discard [MAX_WINDOW]; + +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long flag); +#define conn_flag(c, conn, flag) \ + do { \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + + +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + unsigned long event); +#define conn_event(c, conn, event) \ + do { \ + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); +#define tcp_rst(c, conn) \ + do { \ + flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ + tcp_rst_do(c, conn); \ + } while (0) + + +size_t ipv4_fill_headers(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct iphdr *iph, size_t plen, + const uint16_t *check, uint32_t seq); +size_t ipv6_fill_headers(const struct ctx *c, + const struct tcp_tap_conn *conn, + struct ipv6hdr *ip6h, size_t plen, + uint32_t seq); + +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + int force_seq, struct tcp_info *tinfo); +int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, + struct tcphdr *th, char *data, size_t optlen); + +#endif /* TCP_INTERNAL_H */ -- 2.42.0