On Fri, 13 Sep 2024 14:32:06 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:When available, we want to retrieve our socket peer's advertised window and forward that to the guest. That information has been available from the kernel via the TCP_INFO getsockopt() since kernel commit 8f7baad7f035. Currently our probing for this is a bit odd. The HAS_SND_WND define determines if our headers include the tcp_snd_wnd field, but that doesn't necessarily mean the running kernel supports it. Currently we start by assuming it's _not_ available, but mark it as available if we ever see a non-zero value in the field. This is a bit hit and miss in two ways: * Zero is perfectly possible window the peer could report, so we can get false negativesKind of: one non-zero result was enough to set tcp.kernel_snd_wnd to one. The reason why I implemented it that way was to account for possible kernel backports of that option. On the other hand, any kernel backport would need to preserve the position of tcpi_snd_wnd, regardless of whether preceding fields are missing, so checking the size as you're doing also looks robust, and avoids these two issues altogether.* We're reading TCP_INFO into a local variable, which might not be zero initialised, so if the kernel _doesn't_ write it it could have non-zero garbage, giving us false positives. We can use a more direct way of probing for this: getsockopt() reports the length of the information retreived. So, check whether that's long enough to include the field. This lets us probe the availability of the field once and for all during initialisation. That in turn allows ctx to become a const pointer to tcp_prepare_flags() which cascades through many other functions. We also move the flag for the probe result from the ctx structure to a global, to match peek_offset_cap. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 93 ++++++++++++++++++++++++++++++++++++-------------- tcp.h | 13 +++---- tcp_buf.c | 10 +++--- tcp_buf.h | 6 ++-- tcp_internal.h | 4 +-- 5 files changed, 82 insertions(+), 44 deletions(-) diff --git a/tcp.c b/tcp.c index 14b48a84..cba3f3bd 100644 --- a/tcp.c +++ b/tcp.c @@ -308,11 +308,6 @@ /* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 #define WINDOW_DEFAULT 14600 /* RFC 6928 */ -#ifdef HAS_SND_WND -# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd) -#else -# define KERNEL_REPORTS_SND_WND(c) (0 && (c)) -#endif #define ACK_INTERVAL 10 /* ms */ #define SYN_TIMEOUT 10 /* s */ @@ -370,6 +365,14 @@ char tcp_buf_discard [MAX_WINDOW]; /* Does the kernel support TCP_PEEK_OFF? */ bool peek_offset_cap; +#ifdef HAS_SND_WND +/* Does the kernel report sending window in TCP_INFO (kernel commit + * 8f7baad7f035) + */ +bool snd_wnd_cap; +#else +#define snd_wnd_cap (false) +#endif /* sendmsg() to socket */ static struct iovec tcp_iov [UIO_MAXIOV]; @@ -1052,7 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, } #endif /* !HAS_BYTES_ACKED */ - if (!KERNEL_REPORTS_SND_WND(c)) { + if (!snd_wnd_cap) { tcp_get_sndbuf(conn); new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, @@ -1136,7 +1139,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * 0 if there is no flag to send * 1 otherwise */ -int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen) { @@ -1153,11 +1156,6 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, return -ECONNRESET; } -#ifdef HAS_SND_WND - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) - c->tcp.kernel_snd_wnd = 1; -#endif - if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); @@ -1235,7 +1233,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, + int flags) { return tcp_buf_send_flag(c, conn, flags); } @@ -1245,7 +1244,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @c: Execution context * @conn: Connection pointer */ -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) { if (conn->events == CLOSED) return; @@ -1463,7 +1462,7 @@ static void tcp_bind_outbound(const struct ctx *c, * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ -static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, +static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) @@ -1628,7 +1627,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { return tcp_buf_data_from_sock(c, conn); } @@ -1644,8 +1643,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) * * Return: count of consumed packets */ -static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, - const struct pool *p, int idx) +static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + const struct pool *p, int idx) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; @@ -1842,7 +1841,8 @@ out: * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length */ -static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_conn_from_sock_finish(const struct ctx *c, + struct tcp_tap_conn *conn, const struct tcphdr *th, const char *opts, size_t optlen) { @@ -1885,7 +1885,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { @@ -2023,7 +2023,7 @@ reset: * @c: Execution context * @conn: Connection pointer */ -static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) { socklen_t sl; int so; @@ -2049,8 +2049,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) * @sa: Peer socket address (from accept()) * @now: Current timestamp */ -static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, - const struct timespec *now) +static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, + int s, const struct timespec *now) { struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); uint64_t hash; @@ -2081,7 +2081,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, * @ref: epoll reference of listening socket * @now: Current timestamp */ -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now) { const struct flowside *ini; @@ -2146,7 +2146,7 @@ cancel: * * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 */ -void tcp_timer_handler(struct ctx *c, union epoll_ref ref) +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) { struct itimerspec check_armed = { { 0 }, { 0 } }; struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; @@ -2210,7 +2210,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) * @ref: epoll reference * @events: epoll events bitmap */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events) { struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); @@ -2494,6 +2495,40 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) return ret; } +#ifdef HAS_SND_WND +/** + * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd + * + * Return: true if supported, false otherwise + */ +static bool tcp_probe_snd_wnd_cap(void) +{ + struct tcp_info tinfo; + socklen_t sl = sizeof(tinfo); + int s; + + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + warn_perror("Temporary TCP socket creation failed"); + return false; + } + + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { + warn_perror("Failed to get TCP_INFO on temporary socket"); + close(s); + return false; + } + + close(s); + + if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) + + sizeof(tinfo.tcpi_snd_wnd))) + return false; + + return true; +} +#endif /* HAS_SND_WND */ + /** * tcp_init() - Get initial sequence, hash secret, initialise per-socket data * @c: Execution context @@ -2527,6 +2562,12 @@ int tcp_init(struct ctx *c) (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); +#ifdef HAS_SND_WND + snd_wnd_cap = tcp_probe_snd_wnd_cap(); +#endif + debug("TCP_INFO tcpi_snd_wnd field%ssupported", + snd_wnd_cap ? " " : " not "); + return 0; } diff --git a/tcp.h b/tcp.h index e9ff0191..5585924f 100644 --- a/tcp.h +++ b/tcp.h @@ -10,11 +10,12 @@ struct ctx; -void tcp_timer_handler(struct ctx *c, union epoll_ref ref); -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref); +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now); -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events); +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, @@ -58,16 +59,12 @@ union tcp_listen_epoll_ref { * @fwd_in: Port forwarding configuration for inbound packets * @fwd_out: Port forwarding configuration for outbound packets * @timer_run: Timestamp of most recent timer run - * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) * @pipe_size: Size of pipes for spliced connections */ struct tcp_ctx { struct fwd_ports fwd_in; struct fwd_ports fwd_out; struct timespec timer_run; -#ifdef HAS_SND_WND - int kernel_snd_wnd; -#endif size_t pipe_size; }; diff --git a/tcp_buf.c b/tcp_buf.c index 1a398461..c886c92b 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -239,7 +239,7 @@ void tcp_flags_flush(const struct ctx *c) * @frames: Two-dimensional array containing queued frames with sub-iovs * @num_frames: Number of entries in the two arrays to be compared */ -static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, +static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) { int i; @@ -264,7 +264,7 @@ static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, * tcp_payload_flush() - Send out buffers for segments with data * @c: Execution context */ -void tcp_payload_flush(struct ctx *c) +void tcp_payload_flush(const struct ctx *c) { size_t m; @@ -293,7 +293,7 @@ void tcp_payload_flush(struct ctx *c) * * Return: negative error code on connection reset, 0 otherwise */ -int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) { struct tcp_flags_t *payload; struct iovec *iov; @@ -361,7 +361,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @seq: Sequence number to be sent */ -static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t dlen, int no_csum, uint32_t seq) { struct iovec *iov; @@ -405,7 +405,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, * * #syscalls recvmsg */ -int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; diff --git a/tcp_buf.h b/tcp_buf.h index 3db4c56e..8d4b615a 100644 --- a/tcp_buf.h +++ b/tcp_buf.h @@ -9,8 +9,8 @@ void tcp_sock4_iov_init(const struct ctx *c); void tcp_sock6_iov_init(const struct ctx *c); void tcp_flags_flush(const struct ctx *c); -void tcp_payload_flush(struct ctx *c); -int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); -int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); +void tcp_payload_flush(const struct ctx *c); +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); #endif /*TCP_BUF_H */ diff --git a/tcp_internal.h b/tcp_internal.h index aa8bb64f..bd634be1 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -82,7 +82,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, conn_event_do(c, conn, event); \ } while (0) -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); #define tcp_rst(c, conn) \ do { \ flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ @@ -94,7 +94,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, const uint16_t *check, uint32_t seq); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, int force_seq, struct tcp_info *tinfo); -int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags, +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t *optlen); #endif /* TCP_INTERNAL_H */-- Stefano