The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving us read cycles when forwarding TCP messages in the host->name space direction. When this feature is supported, iov_sock[0].iov_base can be set to NULL. The kernel code will interpret this as an instruction to skip reading of the first iov_sock[0].iov_len bytes of the message. Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this by making tcp_buf_discard a pointer, setting it to NULL if we find that the feature is supported by the kernel, and letting it point to a static buffer if not. There is no macro or function indicating kernel support for this feature. We therefore have to probe for it by creating a temporary TCP connection and reading from it as if the feature were present. If that fails, we fall back to the original design. The traffic connection cannot be used for this purpose, because it will be broken if the probe reading fails. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: Changes as suggested by Stefano Brivio: - Moved probe process/function into a separate, temporary name space, to avoid occupying port numbers in the current name space. - Put discard buffer back to static memory. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- netlink.c | 2 +- netlink.h | 1 + tcp.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/netlink.c b/netlink.c index 379d46e..d5f10de 100644 --- a/netlink.c +++ b/netlink.c @@ -55,7 +55,7 @@ static int nl_seq = 1; * * Return: 0 */ -static int nl_sock_init_do(void *arg) +int nl_sock_init_do(void *arg) { struct sockaddr_nl addr = { .nl_family = AF_NETLINK, }; int *s = arg ? 
&nl_sock_ns : &nl_sock; diff --git a/netlink.h b/netlink.h index 3a1f0de..cadd3b7 100644 --- a/netlink.h +++ b/netlink.h @@ -24,5 +24,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, int nl_link_get_mac(int s, unsigned int ifi, void *mac); int nl_link_set_mac(int s, unsigned int ifi, const void *mac); int nl_link_up(int s, unsigned int ifi, int mtu); +int nl_sock_init_do(void *arg); #endif /* NETLINK_H */ diff --git a/tcp.c b/tcp.c index f506cfd..4410460 100644 --- a/tcp.c +++ b/tcp.c @@ -283,6 +283,7 @@ #include <sys/timerfd.h> #include <sys/types.h> #include <sys/uio.h> +#include <sys/wait.h> #include <time.h> #include <linux/tcp.h> /* For struct tcp_info */ @@ -297,6 +298,7 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "netlink.h" #include "tcp_conn.h" #include "flow_table.h" @@ -402,6 +404,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) +static bool tcp_probe_msg_peek_offset_cap(); + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; +static char tcp_discard_buf[MAX_WINDOW]; +static char* tcp_buf_discard = tcp_discard_buf; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; @@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) + +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? 
+ */ +static inline bool msg_peek_offset_cap() +{ + return !tcp_buf_discard; +} + + /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup * @@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!msg_peek_offset_cap()) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Ignore discard buffer if not needed */ + if (tcp_probe_msg_peek_offset_cap()) + tcp_buf_discard = NULL; return 0; } @@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +static int tcp_probe_sockets(void *arg) +{ + char b; + struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, }; + struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0}; + struct sockaddr a = { AF_INET, {0, }}; + int err = EXIT_FAILURE; + int s[2] = {0, }; + int s_recv = 0; + int *rc = arg; + ssize_t len; + + /* Make sure loopback interface is enabled */ + nl_sock_init_do(NULL); + nl_link_up(nl_sock, 1 /* lo */, 0); + + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s[0] < 0 || s[1] < 0) { + perror("Temporary probe socket creation failed\n"); + goto out; + } + if (0 > bind(s[0], &a, sizeof(a))) { + perror("Temporary probe socket bind() failed\n"); + goto out; + } + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { + perror("Temporary probe socket getsockname() failed\n"); + goto out; + } + if (0 > listen(s[0], 0)) { + perror("Temporary probe socket listen() failed\n"); + goto out; + } + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { + perror("Temporary probe socket connect() failed\n"); + goto out; + } + s_recv = accept(s[0], NULL, NULL); + if (s_recv <= 0) { + perror("Temporary probe socket 
accept() failed\n"); + goto out; + } + if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) { + perror("Temporary probe socket send() failed\n"); + goto out; + } + len = recvmsg(s_recv, &msg, MSG_PEEK); + if (len <= 0 && errno != EFAULT) { + perror("Temporary probe socket recvmsg() failed\n"); + goto out; + } + printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not "); + *rc = len == 1; + err = EXIT_SUCCESS; +out: + close(s_recv); + close(s[1]); + close(s[0]); + return err; +} + +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap() +{ + char ns_fn_stack[NS_FN_STACK_SIZE]; + int child_pid, child_ret, rc = 0; + + child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack), + CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)&rc); + if (child_pid <= 0) { + perror("Failed to clone tcp probe process\n"); + exit(EXIT_FAILURE); + } + waitpid(child_pid, &child_ret, 0); + return rc; +} -- 2.39.2