On Thu, 17 Nov 2022 16:58:44 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Currently, the tables for spliced and non-spliced connections are entirely separate, with different types in different arrays. We want to unify them. As a first step, create a union type which can represent either a spliced or non-spliced connection. For them to be distinguishable, the individual types need to have a common header added, with a bit indicating which type this structure is. This comes at the cost of increasing the size of tcp_tap_conn to over one (64 byte) cacheline. This isn't ideal, but it makes things simpler for now and we'll re-optimize this later. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 4 ++++ tcp_conn.h | 30 ++++++++++++++++++++++++++++++ tcp_splice.c | 2 ++ 3 files changed, 36 insertions(+) diff --git a/tcp.c b/tcp.c index 189041c..05eed85 100644 --- a/tcp.c +++ b/tcp.c @@ -288,6 +288,7 @@ #include <sys/uio.h> #include <unistd.h> #include <time.h> +#include <assert.h> #include <linux/tcp.h> /* For struct tcp_info */ @@ -601,6 +602,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int index) { if ((index < 0) || (index >= TCP_MAX_CONNS)) return NULL; + assert(!(CONN(index)->c.spliced)); return CONN(index); } @@ -2096,6 +2098,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, } conn = CONN(c->tcp.conn_count++); + conn->c.spliced = false; conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2764,6 +2767,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, return; conn = CONN(c->tcp.conn_count++); + conn->c.spliced = false; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; diff --git a/tcp_conn.h b/tcp_conn.h index db4c2d9..39d104a 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -11,8 +11,19 @@ #define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) +/** + * struct tcp_conn_common - Common fields for spliced and non-spliced + * @spliced: Is this 
a spliced connection? + */ +struct tcp_conn_common { + bool spliced :1; +}; + +extern const char *tcp_common_flag_str[]; + /** * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) + * @c: Fields common with tcp_splice_conn * @next_index: Connection index of next item in hash chain, -1 for none * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number @@ -40,6 +51,9 @@ * @seq_init_from_tap: Initial sequence number from tap */ struct tcp_tap_conn { + /* Must be first element to match tcp_splice_conn */ + struct tcp_conn_common c; + int next_index :TCP_CONN_INDEX_BITS + 2; #define TCP_RETRANS_BITS 3 @@ -122,6 +136,7 @@ struct tcp_tap_conn { /** * struct tcp_splice_conn - Descriptor for a spliced TCP connection + * @c: Fields common with tcp_tap_conn * @a: File descriptor number of socket for accepted connection * @pipe_a_b: Pipe ends for splice() from @a to @b * @b: File descriptor number of peer connected socket @@ -134,6 +149,9 @@ struct tcp_tap_conn { * @b_written: Bytes written to @b (not fully written from one @a read) */ struct tcp_splice_conn { + /* Must be first element to match tcp_tap_conn */ + struct tcp_conn_common c; + int a; int pipe_a_b[2]; int b; @@ -165,4 +183,16 @@ struct tcp_splice_conn { uint32_t b_written; }; +/** + * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced) + * @c: Fields common between all variants + * @tap: Fields specific to non-spliced connections + * @splice: Fields specific to spliced connections +*/ +union tcp_conn { + struct tcp_conn_common c; + struct tcp_tap_conn tap; + struct tcp_splice_conn splice; +};Sorry, I could have noticed earlier: I understand that this is needed to end up, at the end of the series, with a 64-byte tcp_conn, but it doesn't really look like the most natural way of doing things. 
I would have expected something like: struct tcp_conn { struct tcp_conn_common c; union { struct tcp_tap_conn tap; struct tcp_splice_conn splice; } u; }; but sure, if we do this, then we have 3 bytes of padding between 'c' and 'u', and struct tcp_conn becomes 68 bytes long. The union approach also confuses Coverity Scan, because in tcp_table_compact() we have: memset(hole, 0, sizeof(*hole)); and, while the prototype is: void tcp_table_compact(struct ctx *c, union tcp_conn *hole) it sees that we're passing, from tcp_splice_destroy(), something smaller than that (a 48-byte struct tcp_splice_conn), yet we're zeroing the whole union. Of course, it's not a real issue: that space is reserved for a connection slot anyway, but given there are no other issues reported, I'd try to keep Coverity happy if possible. First try, failed: check hole->c.spliced and, if set, zero only sizeof(struct tcp_splice_conn) bytes. The warning is still there, so this looks like a false positive. Another try, which should probably work (I just hit the daily build submission quota, grr): explicitly pass the union tcp_conn containing our struct tcp_splice_conn. 
This patch does it: --- diff --git a/tcp.c b/tcp.c index 8874789..d635a8e 100644 --- a/tcp.c +++ b/tcp.c @@ -591,7 +591,7 @@ static size_t tcp6_l2_flags_buf_bytes; union tcp_conn tc[TCP_MAX_CONNS]; #define CONN(index) (&tc[(index)].tap) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN_IDX(conn) (TCP_TAP_TO_COMMON(conn) - tc) /** conn_at_idx() - Find a connection by index, if present * @index: Index of connection to lookup @@ -1385,7 +1385,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn) close(conn->timer); tcp_hash_remove(c, conn); - tcp_table_compact(c, (union tcp_conn *)conn); + tcp_table_compact(c, TCP_TAP_TO_COMMON(conn)); } static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); diff --git a/tcp_conn.h b/tcp_conn.h index 4a8be29..fa407ad 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -176,6 +176,12 @@ union tcp_conn { struct tcp_splice_conn splice; }; +#define TCP_TAP_TO_COMMON(x) \ + ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, tap))) + +#define TCP_SPLICE_TO_COMMON(x) \ + ((union tcp_conn *)((char *)(x) - offsetof(union tcp_conn, splice))) + /* TCP connections */ extern union tcp_conn tc[]; diff --git a/tcp_splice.c b/tcp_splice.c index e2f0ce1..7d3f17e 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -37,6 +37,7 @@ #include <limits.h> #include <stdint.h> #include <stdbool.h> +#include <stddef.h> #include <string.h> #include <time.h> #include <unistd.h> @@ -74,7 +75,7 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2]; #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN(index) (&tc[(index)].splice) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN_IDX(conn) (TCP_SPLICE_TO_COMMON(conn) - tc) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) debug("TCP 
(spliced): index %li, CLOSED", CONN_IDX(conn)); c->tcp.splice_conn_count--; - tcp_table_compact(c, (union tcp_conn *)conn); + tcp_table_compact(c, TCP_SPLICE_TO_COMMON(conn)); } /** --- I can add it on top if you agree, assuming it works. I also tried to actually turn tcp_conn into a struct. It takes 68 bytes, so I'm not pursuing this approach, but I'm including the diff just in case you have some quick idea to fix it up: --- diff --git a/tcp.c b/tcp.c index 8874789..6ee5675 100644 --- a/tcp.c +++ b/tcp.c @@ -588,10 +588,10 @@ static unsigned int tcp6_l2_flags_buf_used; static size_t tcp6_l2_flags_buf_bytes; /* TCP connections */ -union tcp_conn tc[TCP_MAX_CONNS]; +struct tcp_conn tc[TCP_MAX_CONNS]; -#define CONN(index) (&tc[(index)].tap) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN(index) (&tc[(index)].u.tap) +#define CONN_IDX(conn) (TO_TCP_CONN(conn) - tc) /** conn_at_idx() - Find a connection by index, if present * @index: Index of connection to lookup @@ -602,7 +602,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int index) { if ((index < 0) || (index >= TCP_MAX_CONNS)) return NULL; - assert(!(CONN(index)->c.spliced)); + assert(!TO_TCP_CONN(CONN(index))->c.spliced); return CONN(index); } @@ -660,13 +660,13 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, */ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { - int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = TO_TCP_CONN(conn)->c.in_epoll ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, .r.p.tcp.tcp.index = CONN_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { - if (conn->c.in_epoll) + if (TO_TCP_CONN(conn)->c.in_epoll) epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); if (conn->timer != -1) epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); @@ -678,7 +678,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) return -errno; - conn->c.in_epoll = true; + TO_TCP_CONN(conn)->c.in_epoll = true; if (conn->timer != -1) { union epoll_ref ref_t = { .r.proto = IPPROTO_TCP, @@ -1347,9 +1347,9 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, * @c: Execution context * @hole: Pointer to recently closed connection */ -void tcp_table_compact(struct ctx *c, union tcp_conn *hole) +void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) { - union tcp_conn *from; + struct tcp_conn *from; if (CONN_IDX(hole) == --c->tcp.conn_count) { debug("TCP: table compaction: maximum index was %li (%p)", @@ -1361,14 +1361,15 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole) from = tc + c->tcp.conn_count; memcpy(hole, from, sizeof(*hole)); - if (from->c.spliced) - tcp_splice_conn_update(c, &hole->splice); + if (TO_TCP_CONN(from)->c.spliced) + tcp_splice_conn_update(c, &hole->u.splice); else - tcp_tap_conn_update(c, &from->tap, &hole->tap); + tcp_tap_conn_update(c, &from->u.tap, &hole->u.tap); debug("TCP: table compaction (spliced=%d): old index %li, new index %li, " "from: %p, to: %p", - from->c.spliced, CONN_IDX(from), CONN_IDX(hole), from, hole); + TO_TCP_CONN(from)->c.spliced, CONN_IDX(from), CONN_IDX(hole), + from, hole); memset(from, 0, sizeof(*from)); } @@ -1385,7 +1386,7 @@ static void tcp_conn_destroy(struct ctx *c, struct tcp_tap_conn *conn) close(conn->timer); tcp_hash_remove(c, conn); - tcp_table_compact(c, (union 
tcp_conn *)conn); + tcp_table_compact(c, TO_TCP_CONN(conn)); } static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); @@ -1523,7 +1524,7 @@ void tcp_defer_handler(struct ctx *c) { int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE; int max_files = c->nofile / 100 * TCP_FILE_PRESSURE; - union tcp_conn *conn; + struct tcp_conn *conn; tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); @@ -1533,12 +1534,12 @@ void tcp_defer_handler(struct ctx *c) return; for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { - if (conn->c.spliced) { - if (conn->splice.flags & CLOSING) - tcp_splice_destroy(c, &conn->splice); + if (TO_TCP_CONN(conn)->c.spliced) { + if (conn->u.splice.flags & CLOSING) + tcp_splice_destroy(c, &conn->u.splice); } else { - if (conn->tap.events == CLOSED) - tcp_conn_destroy(c, &conn->tap); + if (conn->u.tap.events == CLOSED) + tcp_conn_destroy(c, &conn->u.tap); } } @@ -2086,7 +2087,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, } conn = CONN(c->tcp.conn_count++); - conn->c.spliced = false; + TO_TCP_CONN(conn)->c.spliced = false; conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2770,7 +2771,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, struct sockaddr *sa, const struct timespec *now) { - conn->c.spliced = false; + TO_TCP_CONN(conn)->c.spliced = false; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; @@ -2804,7 +2805,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, const struct timespec *now) { struct sockaddr_storage sa; - union tcp_conn *conn; + struct tcp_conn *conn; socklen_t sl; int s; @@ -2826,11 +2827,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn = tc + c->tcp.conn_count++; if (c->mode == MODE_PASTA && - tcp_splice_conn_from_sock(c, ref, &conn->splice, + tcp_splice_conn_from_sock(c, ref, &conn->u.splice, s, (struct sockaddr *)&sa)) return; - 
tcp_tap_conn_from_sock(c, ref, &conn->tap, s, + tcp_tap_conn_from_sock(c, ref, &conn->u.tap, s, (struct sockaddr *)&sa, now); } @@ -2961,7 +2962,7 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { - union tcp_conn *conn; + struct tcp_conn *conn; if (ref.r.p.tcp.tcp.timer) { tcp_timer_handler(c, ref); @@ -2975,10 +2976,10 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, conn = tc + ref.r.p.tcp.tcp.index; - if (conn->c.spliced) - tcp_splice_sock_handler(c, &conn->splice, ref.r.s, events); + if (TO_TCP_CONN(conn)->c.spliced) + tcp_splice_sock_handler(c, &conn->u.splice, ref.r.s, events); else - tcp_tap_sock_handler(c, &conn->tap, events); + tcp_tap_sock_handler(c, &conn->u.tap, events); } /** @@ -3370,7 +3371,7 @@ static int tcp_port_rebind(void *arg) void tcp_timer(struct ctx *c, const struct timespec *ts) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; - union tcp_conn *conn; + struct tcp_conn *conn; (void)ts; @@ -3394,11 +3395,11 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) } for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { - if (conn->c.spliced) { - tcp_splice_timer(c, &conn->splice); + if (TO_TCP_CONN(conn)->c.spliced) { + tcp_splice_timer(c, &conn->u.splice); } else { - if (conn->tap.events == CLOSED) - tcp_conn_destroy(c, &conn->tap); + if (conn->u.tap.events == CLOSED) + tcp_conn_destroy(c, &conn->u.tap); } } diff --git a/tcp_conn.h b/tcp_conn.h index 4a8be29..3df7905 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -23,7 +23,6 @@ extern const char *tcp_common_flag_str[]; /** * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) - * @c: Fields common with tcp_splice_conn * @next_index: Connection index of next item in hash chain, -1 for none * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number @@ -47,9 +46,6 
@@ extern const char *tcp_common_flag_str[]; * @seq_init_from_tap: Initial sequence number from tap */ struct tcp_tap_conn { - /* Must be first element to match tcp_splice_conn */ - struct tcp_conn_common c; - int next_index :TCP_CONN_INDEX_BITS + 2; #define TCP_RETRANS_BITS 3 @@ -118,7 +114,6 @@ struct tcp_tap_conn { /** * struct tcp_splice_conn - Descriptor for a spliced TCP connection - * @c: Fields common with tcp_tap_conn * @a: File descriptor number of socket for accepted connection * @pipe_a_b: Pipe ends for splice() from @a to @b * @b: File descriptor number of peer connected socket @@ -131,9 +126,6 @@ struct tcp_tap_conn { * @b_written: Bytes written to @b (not fully written from one @a read) */ struct tcp_splice_conn { - /* Must be first element to match tcp_tap_conn */ - struct tcp_conn_common c; - int a; int pipe_a_b[2]; int b; @@ -165,22 +157,27 @@ struct tcp_splice_conn { }; /** - * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced) + * struct tcp_conn - Descriptor for a TCP connection (spliced or non-spliced) * @c: Fields common between all variants - * @tap: Fields specific to non-spliced connections - * @splice: Fields specific to spliced connections + * @u.tap: Fields specific to non-spliced connections + * @u.splice: Fields specific to spliced connections */ -union tcp_conn { +struct tcp_conn { struct tcp_conn_common c; - struct tcp_tap_conn tap; - struct tcp_splice_conn splice; + union { + struct tcp_tap_conn tap; + struct tcp_splice_conn splice; + } u; }; +#define TO_TCP_CONN(x) \ + ((struct tcp_conn *)((char *)(x) - offsetof(struct tcp_conn, u))) + /* TCP connections */ -extern union tcp_conn tc[]; +extern struct tcp_conn tc[]; void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new); -void tcp_table_compact(struct ctx *c, union tcp_conn *hole); +void tcp_table_compact(struct ctx *c, struct tcp_conn *hole); void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); void tcp_splice_timer(struct 
ctx *c, struct tcp_splice_conn *conn); void tcp_splice_pipe_refill(const struct ctx *c); diff --git a/tcp_splice.c b/tcp_splice.c index e2f0ce1..04fc513 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -37,6 +37,7 @@ #include <limits.h> #include <stdint.h> #include <stdbool.h> +#include <stddef.h> #include <string.h> #include <time.h> #include <unistd.h> @@ -73,8 +74,8 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2]; #define CONN_V6(x) (x->flags & SPLICE_V6) #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) -#define CONN(index) (&tc[(index)].splice) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN(index) (&tc[(index)].u.splice) +#define CONN_IDX(conn) (TO_TCP_CONN(conn) - tc) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -165,7 +166,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, static int tcp_splice_epoll_ctl(const struct ctx *c, struct tcp_splice_conn *conn) { - int m = conn->c.in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = TO_TCP_CONN(conn)->c.in_epoll ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.index = CONN_IDX(conn) }; union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b, @@ -185,7 +186,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, epoll_ctl(c->epollfd, m, conn->b, &ev_b)) goto delete; - conn->c.in_epoll = true; + TO_TCP_CONN(conn)->c.in_epoll = true; return 0; @@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn)); c->tcp.splice_conn_count--; - tcp_table_compact(c, (union tcp_conn *)conn); + tcp_table_compact(c, TO_TCP_CONN(conn)); } /** @@ -535,7 +536,7 @@ bool tcp_splice_conn_from_sock(struct ctx *c, union epoll_ref ref, if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s); - conn->c.spliced = true; + TO_TCP_CONN(conn)->c.spliced = true; c->tcp.splice_conn_count++; conn->a = s; --- I'm fine with this series in any case. If you don't have other ideas, I would just try to get rid of that warning (Out-of-bounds access, CWE-119) with the first diff here, or something similar. -- Stefano